Professional Documents
Culture Documents
Spark Best Practices
Spark Best Practices
sqlContext.table("table_schema.table_name").filter(col("prod_cd").isin("", ""))
df.filter(…)
# I have removed the count and show actions from the production code
df.groupBy()
df.filter()
df.count()
df.groupBy()
df.show()
df.registerTempTable("temp_table_name")
df.write.parquet("path+filename")
df.filter(…)
df.join(df2,…)
df.groupBy()
d
df.join(df2,…)
df.groupBy()
df.filter()
sqlContext.table("db.table").select(["col1", "col2"])
sqlContext.table("db.table")