# This code comes with the Hands-On for the Introduction to Spark
# - some examples are adaptions from Advanced Analytics with Spark (Sandy Ryza et al.)
# - some examples are adaptions from the Databricks API examples
#
# AI and Predictive Analytics in Datacenter Environments
# http://dcai.bsc.es
#
# NOTE: this script assumes a live SparkSession bound to `spark`
# (e.g. the `pyspark` shell or a Databricks notebook provides it).
#
# Download HUS datasets:
#
#   wget http://bit.ly/2jZgeZY --output-document csv_hus.zip
#   mkdir hus
#   mv csv_hus.zip hus/
#   cd hus
#   unzip csv_hus.zip

## SPARK SQL EXAMPLES

# Load File
# - header=True: first CSV row supplies column names
# - mode="DROPMALFORMED": silently discard rows that fail to parse
# - inferSchema=True: extra pass over the data to guess column types
df = spark.read.csv("/home/vagrant/hus/ss13husa.csv",
                    header=True,
                    mode="DROPMALFORMED",
                    inferSchema=True)

# Basic Operations
df.take(5)        # fetch the first 5 rows to the driver
df.printSchema()  # show the inferred column names/types
df.count()        # total number of rows (triggers a full scan)

# Apply Relational Queries (DataFrame API)
df1 = df.limit(10)
df1.show()

slice1 = df.select("SERIALNO", "RT", "DIVISION", "REGION")
slice1.show()

slice2 = df.select("SERIALNO", "RT", "DIVISION", "REGION").filter("PUMA > 2600")
slice2.show()
slice2.count()

# Rebind slice2 with a tighter predicate; this is the version saved below.
slice2 = df.select("SERIALNO", "RT", "DIVISION", "REGION").filter("PUMA > 2600 AND REGION = 3")
slice2.count()

df.groupBy("DIVISION").count().show()
df.groupBy("DIVISION").max("SERIALNO").show()

# Apply SQL Queries
# Global temp views live in the reserved `global_temp` database and are
# shared across sessions; re-running this line in the same app raises
# AnalysisException because the view already exists.
df.createGlobalTempView("husa")
spark.sql("SELECT SERIALNO, RT, DIVISION, REGION FROM global_temp.husa").show()
spark.sql("SELECT SERIALNO, RT, DIVISION, REGION FROM global_temp.husa WHERE PUMA < 2100").show()
spark.sql("SELECT DIVISION, COUNT(*) FROM global_temp.husa GROUP BY DIVISION").show()

# DataFrame API vs. SQL — each pair below is (roughly) equivalent
df.select("SERIALNO", "RT", "DIVISION", "REGION").show()
spark.sql("SELECT SERIALNO, RT, DIVISION, REGION FROM global_temp.husa").show()

df.select("SERIALNO", "RT", "DIVISION", "REGION").filter("PUMA > 2600").show()
spark.sql("SELECT SERIALNO, RT, DIVISION, REGION FROM global_temp.husa WHERE PUMA < 2100").show()

df.groupBy("DIVISION").count().show()
spark.sql("SELECT DIVISION, COUNT(*) FROM global_temp.husa GROUP BY DIVISION").show()

# Save Files
# Both writes fail if the target path already exists (default SaveMode).
slice2.write.format("csv").save("/home/vagrant/hus/slice2.csv")
slice2.write.parquet("/home/vagrant/hus/slice2.parquet")

# Read the Parquet output back (schema is stored in the file itself)
pqSlice2 = spark.read.parquet("/home/vagrant/hus/slice2.parquet")