Tuesday, 25 March 2025

Hadoop PySpark Word Count on Google Colab

This post walks through a classic word-count MapReduce example with PySpark inside Google Colab: install Java and Spark, initialize a session, run the job, and save the results.

# Step 1: Install Java, Spark (prebuilt for Hadoop), and findspark

!apt-get install openjdk-8-jdk-headless -qq > /dev/null

!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz

!tar xf spark-3.1.2-bin-hadoop3.2.tgz

!pip install findspark
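
Optionally, confirm the Java install succeeded before pointing Spark at it:

!java -version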


# Step 2: Set environment variables

import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"


# Step 3: Initialize PySpark

import findspark

findspark.init()


from pyspark.sql import SparkSession


# Create the Spark session (local[*] runs Spark on all available cores)

spark = SparkSession.builder.master("local[*]").appName("WordCount").getOrCreate()
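
If the setup worked, the session reports the version that was downloaded:

print(spark.version)  # 3.1.2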


# Step 4: Create a sample text input (simulating a file)

input_data = [

    "Hello Spark",

    "This is a test for Word Count",

    "Spark is great for big data processing",

    "Word Count is a simple example"

]


# Parallelize the list to create an RDD

rdd = spark.sparkContext.parallelize(input_data)
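
As an optional check, take() returns a few elements without pulling the whole RDD to the driver:

print(rdd.take(2))  # ['Hello Spark', 'This is a test for Word Count']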


# Step 5: Perform Word Count (MapReduce)

word_pairs = rdd.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1))

word_count = word_pairs.reduceByKey(lambda x, y: x + y)
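
An optional extra step, not part of the original example: sortBy() can order the pairs by count so the most frequent words come out first:

sorted_counts = word_count.sortBy(lambda pair: pair[1], ascending=False)
print(sorted_counts.take(3))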


# Step 6: Collect and print results

result = word_count.collect()

for word, count in result:

    print(f"{word}: {count}")
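
Note that collect() pulls every result to the driver, which is fine for this toy dataset; on larger data, take(n) is the safer pattern:

for word, count in word_count.take(5):
    print(f"{word}: {count}")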


# Step 7: Save the results to a local file (simulating HDFS save)

word_count.saveAsTextFile("/content/word_count_output")
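
saveAsTextFile() writes one part-* file per partition and fails if the output directory already exists, so remove it before re-running (e.g. import shutil; shutil.rmtree("/content/word_count_output", ignore_errors=True)). To inspect the saved results:

!ls /content/word_count_output
!cat /content/word_count_output/part-*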


