First, create the input files in your terminal using the `cat` command (type the rows, then press Ctrl+D to finish each file):
cat > employees.csv
1,John Doe,101,50000
2,Jane Smith,102,60000
3,Jim Brown,101,55000
4,Jake White,103,70000
cat > departments.csv
101,HR
102,Engineering
103,Marketing
Next, copy both files into HDFS so Pig can read them:
>hadoop fs -put employees.csv emp.csv
>hadoop fs -put departments.csv dept.csv
> vi pig1.pig
-- pig1.pig: load the employee records from HDFS and print them.
-- Fields are comma-separated: id, name, department id, salary.
employees = LOAD 'emp.csv' USING PigStorage(',')
    AS (employee_id:int,
        name:chararray,
        department_id:int,
        salary:float);
DUMP employees;
> pig pig1.pig
> vi pig2.pig
-- pig2.pig: load the department lookup table from HDFS and print it.
-- Fields are comma-separated: department id, department name.
departments = LOAD 'dept.csv' USING PigStorage(',')
    AS (department_id:int, department_name:chararray);
DUMP departments;
>pig pig2.pig
>vi pig3.pig
-- pig3.pig: filter, project, group, join, sort, and store employee data.

-- Load the employee records: id, name, department id, salary.
employees = LOAD 'emp.csv' USING PigStorage(',')
    AS (employee_id:int, name:chararray, department_id:int, salary:float);

-- Load the department lookup table. This LOAD was missing in the
-- original script, so the JOIN below referenced an undefined alias.
departments = LOAD 'dept.csv' USING PigStorage(',')
    AS (department_id:int, department_name:chararray);

-- Keep only employees earning more than 55000.
high_salary_employees = FILTER employees BY salary > 55000;
DUMP high_salary_employees;

-- Project only the name and salary of high-salary employees.
projected_employees = FOREACH high_salary_employees GENERATE name, salary;

-- Group employees by department_id.
grouped_employees = GROUP employees BY department_id;

-- Join employees with departments to get department names.
joined_data = JOIN employees BY department_id, departments BY department_id;

-- Sort employees by salary in descending order.
sorted_employees = ORDER employees BY salary DESC;

-- Store the sorted employees into HDFS as comma-separated output.
STORE sorted_employees INTO 'sorted_employees_output' USING PigStorage(',');
No comments:
Post a Comment