from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Reuse an already-running SparkContext when there is one (a bare
# SparkContext() raises if a context is already active); otherwise create it.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# Use the Spark CSV datasource with options specifying:
# - First line of file is a header
# - Automatically infer the schema of the data
# The reader comes from the sqlContext built above, so the script does not
# rely on a `spark` name that this file never defines. The chained calls are
# wrapped in parentheses so the expression can span several lines.
data = (
    sqlContext.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/databricks-datasets/samples/population-vs-price/data_geo.csv")
)

data.cache()  # Cache data for faster reuse