filePath = "dbfs:/databricks-datasets/SPARK_README.md"  # path in the Databricks File System
lines = sc.textFile(filePath)                           # read the file into the cluster as an RDD of lines
lines.take(10)                                          # display the first 10 lines of the file
Out[10]:
[u'# Apache Spark',
u'',
u'Spark is a fast and general cluster computing system for Big Data. It provides',
u'high-level APIs in Scala, Java, Python, and R, and an optimized engine that',
u'supports general computation graphs for data analysis. It also supports a',
u'rich set of higher-level tools including Spark SQL for SQL and DataFrames,',
u'MLlib for machine learning, GraphX for graph processing,',
u'and Spark Streaming for stream processing.',
u'',
u'<http://spark.apache.org/>']
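The same file could also be read through the DataFrame API; a minimal sketch, assuming the SparkSession that Databricks provides as spark (the RDD returned by sc.textFile above is what the rest of this example uses):

readmeDF = spark.read.text(filePath)  # DataFrame with a single string column named "value"
readmeDF.show(10, truncate=False)     # display the first 10 lines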
numPartitions = lines.getNumPartitions()  # get the number of partitions
print "Number of partitions (workers) storing the dataset = %d" % numPartitions
Number of partitions (workers) storing the dataset = 2
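If more parallelism is needed, the data can be redistributed across the cluster; a sketch only, with an illustrative partition count of 4 that is not used in the rest of the example:

repartitionedLines = lines.repartition(4)  # shuffle the lines into 4 partitions
repartitionedLines.getNumPartitions()      # now reports 4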
words = lines.flatMap(lambda x: x.split(' '))  # split each line into words and flatten into one RDD of words
words.take(10)                                 # display the first 10 words
Out[12]:
[u'#',
u'Apache',
u'Spark',
u'',
u'Spark',
u'is',
u'a',
u'fast',
u'and',
u'general']
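For contrast, map() would keep one list of words per line instead of flattening them into a single RDD of words; a small sketch, not part of the pipeline:

nestedWords = lines.map(lambda x: x.split(' '))  # RDD of lists, one list per line
nestedWords.take(2)                              # e.g. [[u'#', u'Apache', u'Spark'], [u'']]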
stopWords = ['', 'a', '*', 'and', 'is', 'of', 'the']                # define the list of stop words
filteredWords = words.filter(lambda x: x.lower() not in stopWords)  # keep only words that are not stop words
filteredWords.take(10)                                              # display the first 10 filtered words
Out[13]:
[u'#',
u'Apache',
u'Spark',
u'Spark',
u'fast',
u'general',
u'cluster',
u'computing',
u'system',
u'for']
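Note that punctuation and Markdown characters stay attached to the tokens (u'#' above, and later u'Scala,' and u'"local"'). A hedged variation that normalizes the words before filtering, not used in the rest of the example:

import re

normalizedWords = (words.map(lambda x: re.sub(r'[^\w]', '', x.lower()))  # strip non-word characters and lowercase
                        .filter(lambda x: x not in stopWords))           # then drop stop words and empty strings
normalizedWords.take(10)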
filteredWords.cache()  # cache the filtered dataset in memory across the cluster's worker nodes
Out[14]: PythonRDD[39] at RDD at PythonRDD.scala:48
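cache() is lazy: nothing is materialized until an action runs over the RDD. A minimal sketch of forcing, and later releasing, the cache:

filteredWords.count()        # the first action computes the RDD and populates the cache
# filteredWords.unpersist()  # call this once the cached data is no longer needed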
word1Tuples = filteredWords.map(lambda x: (x, 1))  # map each word into a (word, 1) tuple
word1Tuples.take(10)                               # display the first 10 (word, 1) tuples
Out[15]:
[(u'#', 1),
(u'Apache', 1),
(u'Spark', 1),
(u'Spark', 1),
(u'fast', 1),
(u'general', 1),
(u'cluster', 1),
(u'computing', 1),
(u'system', 1),
(u'for', 1)]
wordCountTuples = word1Tuples.reduceByKey(lambda x, y: x + y)  # aggregate the counts for each word
wordCountTuples.take(10)                                       # display the first 10 (word, count) tuples
Out[16]:
[(u'when', 1),
(u'"local"', 1),
(u'through', 1),
(u'computation', 1),
(u'using:', 1),
(u'guidance', 2),
(u'Scala,', 1),
(u'environment', 1),
(u'only', 1),
(u'rich', 1)]
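As an aside, for a vocabulary this small the same counts can be pulled straight back to the driver as a Python dictionary; a sketch, not part of the pipeline:

wordCounts = filteredWords.countByValue()  # defaultdict mapping word -> count, collected on the driver
wordCounts[u'Spark']                       # 13, matching the (word, count) tuples below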
sortedWordCountTuples = wordCountTuples.top(10, key=lambda (word, count): count)  # top 10 (word, count) tuples by count
for wordCountTuple in sortedWordCountTuples:  # display the top 10 (word, count) tuples
    print str(wordCountTuple)
(u'to', 14)
(u'Spark', 13)
(u'for', 11)
(u'##', 8)
(u'run', 7)
(u'can', 6)
(u'on', 5)
(u'in', 5)
(u'you', 4)
(u'if', 4)
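The tuple-unpacking lambda used with top() above is Python 2 only; an equivalent that also runs on Python 3 indexes into the tuple instead:

sortedWordCountTuples = wordCountTuples.top(10, key=lambda kv: kv[1])  # same top 10 by count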
from pyspark.sql import Row  # import the pyspark.sql Row class

wordCountRows = wordCountTuples.map(lambda p: Row(word=p[0], count=int(p[1])))  # convert (word, count) tuples into Rows
wordCountRows.toDF().createOrReplaceTempView("word_count")                      # register the DataFrame as a temp view for SQL
WordCount Example
Goal: Determine the most popular words in a given text file using Python and SQL
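To finish the Python-and-SQL goal, the registered view can be queried with Spark SQL; a sketch, assuming the SparkSession that Databricks exposes as spark:

topWordsDF = spark.sql("""
    SELECT word, `count`
    FROM word_count
    ORDER BY `count` DESC
    LIMIT 10
""")               # the 10 most frequent non-stop words
topWordsDF.show()  # or display(topWordsDF) in a Databricks notebook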