# Step 1: Install Java and download a Spark build bundled with Hadoop
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# archive.apache.org keeps every Spark release, so older versions such as 3.1.2 stay
# downloadable even after regional mirrors drop them.
!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark
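# Optional sanity check (a small addition, not required): confirm the Spark
# distribution unpacked where Step 2 expects it; the path matches the tarball name above.
!ls /content/spark-3.1.2-bin-hadoop3.2 | head -5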
# Step 2: Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"
# Step 3: Initialize PySpark
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Create the Spark session
spark = SparkSession.builder.master("local").appName("WordCount").getOrCreate()
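# Optional: confirm the session is live and quiet the chatty INFO logging.
# (setLogLevel is a standard SparkContext method; "local[*]" could be used in the
# builder above to use every Colab core, but plain "local" is enough for this example.)
print("Spark version:", spark.version)
spark.sparkContext.setLogLevel("ERROR")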
# Step 4: Create a sample text input (simulating a file)
input_data = [
    "Hello Spark",
    "This is a test for Word Count",
    "Spark is great for big data processing",
    "Word Count is a simple example"
]
# Parallelize the list to create an RDD
rdd = spark.sparkContext.parallelize(input_data)
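# Optional: peek at the RDD to confirm it was built correctly; take() pulls a small
# sample back to the driver without materializing the whole dataset.
print(rdd.take(2))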
# Step 5: Perform Word Count (MapReduce)
# flatMap splits each line into words (the "map" phase), then map pairs each word with 1.
word_pairs = rdd.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1))
# reduceByKey sums the 1s per word (the "reduce" phase), combining within each partition
# before shuffling the partial counts across the cluster.
word_count = word_pairs.reduceByKey(lambda x, y: x + y)
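# Optional: the same count expressed with the DataFrame API, which Spark's optimizer can
# plan for you. This is a sketch, not part of the RDD pipeline above; the column name
# "line" is just a label chosen here for the single text column.
from pyspark.sql.functions import explode, split, col
df = spark.createDataFrame([(line,) for line in input_data], ["line"])
df_counts = (df.select(explode(split(col("line"), " ")).alias("word"))
               .groupBy("word").count())
df_counts.show()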
# Step 6: Collect and print results
result = word_count.collect()
for word, count in result:
    print(f"{word}: {count}")
# Step 7: Save the results to a local directory (simulating an HDFS save)
word_count.saveAsTextFile("/content/word_count_output")
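# Note: like an HDFS write, saveAsTextFile produces a directory of part-* files rather
# than a single file, and it fails if the path already exists on a re-run. Reading the
# directory back with textFile is a quick way to verify what was written.
saved = spark.sparkContext.textFile("/content/word_count_output")
print(saved.collect())
# Stop the session once finished to release the local JVM resources.
spark.stop()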