Step 1: Create Input File in Ubuntu
nano matrix_input.txt

Enter one record per line, in the form matrix,row,column,value:

A,0,0,1
A,0,1,2
A,1,0,3
A,1,1,4
B,0,0,5
B,0,1,6
B,1,0,7
B,1,1,8

Save and exit: CTRL + O, ENTER, CTRL + X
=================================================================================
Step 2: Create Mapper Script
nano mapper.py

mapper.py
#!/usr/bin/env python3
"""Hadoop Streaming mapper for the matrix product C = A x B.

Reads records of the form ``matrix,row,col,value`` from standard input.
For every cell of C that an element contributes to, it prints one line
``i,j<TAB>matrix,k,value``, where ``i,j`` is the cell of C and ``k`` is the
element's index along the dimension shared by A and B.
"""
import sys

# Define matrix sizes
# A is 2x2
# B is 2x2
# An element A[i][k] is needed by every column of the result, and an element
# B[k][j] by every row of the result, so the fan-out is governed by the
# result's shape (rows of A x columns of B), not by the common dimension.
A_ROWS = 2  # rows of A == rows of C
B_COLS = 2  # columns of B == columns of C


def map_line(line, a_rows=A_ROWS, b_cols=B_COLS):
    """Return the output records produced by one input line.

    Args:
        line: One ``matrix,row,col,value`` record; surrounding whitespace
            (including the trailing newline) is ignored.
        a_rows: Number of rows of A (rows of the result).
        b_cols: Number of columns of B (columns of the result).

    Returns:
        A list of ``i,j<TAB>matrix,k,value`` strings; empty for a blank line.

    Raises:
        ValueError: If the line does not have exactly four comma-separated
            fields, or the indices/value are not numeric.
    """
    line = line.strip()
    if not line:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        return []

    matrix, i, j, value = line.split(",")
    i = int(i)
    j = int(j)
    value = float(value)

    if matrix == "A":
        # A[i][j] contributes to C[i][col] for every column of the result.
        return [f"{i},{col}\tA,{j},{value}" for col in range(b_cols)]
    # Matrix B: B[i][j] contributes to C[row][j] for every row of the result.
    return [f"{row},{j}\tB,{i},{value}" for row in range(a_rows)]


def map_records(lines, a_rows=A_ROWS, b_cols=B_COLS):
    """Yield the mapper output records for an iterable of input lines."""
    for line in lines:
        yield from map_line(line, a_rows, b_cols)


def main():
    """Map every line of standard input and print the resulting records."""
    for record in map_records(sys.stdin):
        print(record)


if __name__ == "__main__":
    main()
# Make it executable:
chmod +x mapper.py
=========================================================================
Step 3: Create Reducer Script
nano reducer.py

reducer.py
#!/usr/bin/env python3
"""Hadoop Streaming reducer for the matrix product C = A x B.

Reads lines of the form ``i,j<TAB>matrix,k,value`` from standard input,
grouped by key (``i,j``), and prints ``i,j<TAB>result`` for each key, where
``result`` is the sum over ``k`` of ``A[k] * B[k]``.
"""
import sys
from collections import defaultdict


def dot_product(values):
    """Return the sum of ``A[k] * B[k]`` over the entries of one result cell.

    Args:
        values: Iterable of ``matrix,k,value`` strings belonging to one key.

    Returns:
        The accumulated product as a float (``0.0`` when there are no A
        entries, so the printed output is always formatted as a float).

    Raises:
        ValueError: If an entry does not have exactly three comma-separated
            fields, or ``k``/``value`` are not numeric.
    """
    a_entries = {}
    # A defaultdict lets a k that has no B entry count as 0.0.
    b_entries = defaultdict(float)

    for val in values:
        matrix, k, value = val.split(",")
        k = int(k)
        value = float(value)
        if matrix == "A":
            a_entries[k] = value
        else:
            b_entries[k] = value

    result = 0.0
    for k, a_value in a_entries.items():
        result += a_value * b_entries[k]
    return result


def compute_result(key, values):
    """Print ``key<TAB>result`` for one result cell."""
    print(f"{key}\t{dot_product(values)}")


def group_by_key(lines):
    """Yield ``(key, values)`` for each run of consecutive lines sharing a key.

    Args:
        lines: Iterable of ``key<TAB>value`` lines. Lines with the same key
            must be adjacent (the pipeline sorts them before the reducer).
            Blank lines are skipped.

    Yields:
        Tuples of the key and the list of its values, in input order.
    """
    current_key = None
    values = []

    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Split on the first tab only: everything after it is the value.
        key, val = line.split("\t", 1)
        if key != current_key:
            if current_key is not None:
                yield current_key, values
            current_key = key
            values = []
        values.append(val)

    if current_key is not None:
        yield current_key, values


def main():
    """Reduce the key-grouped records on standard input and print the results."""
    for key, values in group_by_key(sys.stdin):
        compute_result(key, values)


if __name__ == "__main__":
    main()
# Make it executable:
chmod +x reducer.py
=================================================================
Step 4: Test Locally (Without Hadoop)
cat matrix_input.txt | ./mapper.py | sort | ./reducer.py

Expected Output:
0,0 19.0
0,1 22.0
1,0 43.0
1,1 50.0
==============================================================
Step 5: Run Using Hadoop Streaming
Create HDFS directory
hdfs dfs -mkdir /matrix

Upload input file
hdfs dfs -put matrix_input.txt /matrix

Run Hadoop Streaming Job
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming*.jar \
-input /matrix/matrix_input.txt \
-output /matrix_output \
-mapper mapper.py \
-reducer reducer.py \
-file mapper.py \
-file reducer.py
Step 6: View Output
hdfs dfs -cat /matrix_output/part-00000
No comments:
Post a Comment