# This code comes with the Hands-On for the Introduction to Spark
# - some examples are adaptions from Advanced Analytics with Spark (Sandy Ryza et al.)
# - some examples are adaptions from the Databricks API examples
#
# AI and Predictive Analytics in Datacenter Environments
# http://dcai.bsc.es
#
# NOTE: this script assumes a live SparkSession bound to `spark`
# (e.g. the `pyspark` shell or a Databricks notebook provides it).
#
# Download HUS datasets:
#
#   wget http://bit.ly/2jZgeZY --output-document csv_hus.zip
#   mkdir hus
#   mv csv_hus.zip hus/
#   cd hus
#   unzip csv_hus.zip

## SPARK SQL EXAMPLES

# Load File
# - header=True: first CSV row supplies column names
# - mode="DROPMALFORMED": silently discard rows that fail to parse
# - inferSchema=True: extra pass over the data to guess column types
df = spark.read.csv("/home/vagrant/hus/ss13husa.csv",
                    header=True,
                    mode="DROPMALFORMED",
                    inferSchema=True)

# Basic Operations
df.take(5)        # fetch the first 5 rows to the driver
df.printSchema()  # show the inferred column names/types
df.count()        # total number of rows (triggers a full scan)

# Apply Relational Queries (DataFrame API)
df1 = df.limit(10)
df1.show()

slice1 = df.select("SERIALNO", "RT", "DIVISION", "REGION")
slice1.show()

slice2 = df.select("SERIALNO", "RT", "DIVISION", "REGION").filter("PUMA > 2600")
slice2.show()
slice2.count()

# Rebind slice2 with a tighter predicate; this is the version saved below.
slice2 = df.select("SERIALNO", "RT", "DIVISION", "REGION").filter("PUMA > 2600 AND REGION = 3")
slice2.count()

df.groupBy("DIVISION").count().show()
df.groupBy("DIVISION").max("SERIALNO").show()

# Apply SQL Queries
# Global temp views live in the reserved `global_temp` database and are
# shared across sessions; re-running this line in the same app raises
# AnalysisException because the view already exists.
df.createGlobalTempView("husa")
spark.sql("SELECT SERIALNO, RT, DIVISION, REGION FROM global_temp.husa").show()
spark.sql("SELECT SERIALNO, RT, DIVISION, REGION FROM global_temp.husa WHERE PUMA < 2100").show()
spark.sql("SELECT DIVISION, COUNT(*) FROM global_temp.husa GROUP BY DIVISION").show()

# DataFrame API vs. SQL — each pair below is (roughly) equivalent
df.select("SERIALNO", "RT", "DIVISION", "REGION").show()
spark.sql("SELECT SERIALNO, RT, DIVISION, REGION FROM global_temp.husa").show()

df.select("SERIALNO", "RT", "DIVISION", "REGION").filter("PUMA > 2600").show()
spark.sql("SELECT SERIALNO, RT, DIVISION, REGION FROM global_temp.husa WHERE PUMA < 2100").show()

df.groupBy("DIVISION").count().show()
spark.sql("SELECT DIVISION, COUNT(*) FROM global_temp.husa GROUP BY DIVISION").show()

# Save Files
# Both writes fail if the target path already exists (default SaveMode).
slice2.write.format("csv").save("/home/vagrant/hus/slice2.csv")
slice2.write.parquet("/home/vagrant/hus/slice2.parquet")

# Read the Parquet output back (schema is stored in the file itself)
pqSlice2 = spark.read.parquet("/home/vagrant/hus/slice2.parquet")