Hadoop case study: partition

淡淡的烟草味﹌ 2022-11-21 15:14

Contents

    • Input data
    • Expected result
    • Requirement analysis
      • Custom PhoneFlowBean
    • Custom MyPartitioner
    • Mapper class
    • Reducer class
    • Driver class
    • Execution results

Input data

    1 13736230513 192.196.100.1 www.hadoop.com 2481 24681 200
    2 13846544121 192.196.100.2 264 0 200
    3 13956435636 192.196.100.3 132 1512 200
    4 13966251146 192.168.100.1 240 0 404
    5 18271575951 192.168.100.2 www.hadoop.com 1527 2106 200
    6 84188413 192.168.100.3 www.hadoop.com 4116 1432 200
    7 13590439668 192.168.100.4 1116 954 200
    8 15910133277 192.168.100.5 www.hao123.com 3156 2936 200
    9 13729199489 192.168.100.6 240 0 200
    10 13630577991 192.168.100.7 www.shouhu.com 6960 690 200
    11 15043685818 192.168.100.8 www.baidu.com 3659 3538 200
    12 15959002129 192.168.100.9 www.hadoop.com 1938 180 500
    13 13560439638 192.168.100.10 918 4938 200
    14 13470253144 192.168.100.11 180 180 200
    15 13682846555 192.168.100.12 www.qq.com 1938 2910 200
    16 13992314666 192.168.100.13 www.gaga.com 3008 3720 200
    17 13509468723 192.168.100.14 www.qinghua.com 7335 110349 404
    18 18390173782 192.168.100.15 www.sogou.com 9531 2412 200
    19 13975057813 192.168.100.16 www.baidu.com 11058 48243 200
    20 13768778790 192.168.100.17 120 120 200
    21 13568436656 192.168.100.18 www.alibaba.com 2481 24681 200
    22 13568436656 192.168.100.19 1116 954 200

Expected result

Expected output: phone numbers starting with 136, 137, 138, and 139 each go into their own file (four files in total), and numbers with any other prefix go into a fifth file.

Requirement analysis

(The original post includes a requirement-analysis diagram here.) In short: write a custom Partitioner that looks at the first three digits of the phone-number key, returning partitions 0 to 3 for the prefixes 136 to 139 and partition 4 for everything else, and set the number of reduce tasks to 5 so that each partition is written to its own output file.

Custom PhoneFlowBean

    package com.mr.mypartition;

    import org.apache.hadoop.io.Writable;

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;

    public class PhoneFlowBean implements Writable {

        // upstream (upload) traffic
        private long upFlow;
        // downstream (download) traffic
        private long downFlow;
        // total traffic = upstream + downstream
        private long sumFlow;

        public PhoneFlowBean() {
        }

        public PhoneFlowBean(long upFlow, long downFlow) {
            this.upFlow = upFlow;
            this.downFlow = downFlow;
            this.sumFlow = upFlow + downFlow;
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeLong(upFlow);
            out.writeLong(downFlow);
            out.writeLong(sumFlow);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            this.upFlow = in.readLong();
            this.downFlow = in.readLong();
            this.sumFlow = in.readLong();
        }

        @Override
        public String toString() {
            return upFlow + "\t" + downFlow + "\t" + sumFlow;
        }

        public void setUpFlow(long upFlow) {
            this.upFlow = upFlow;
        }

        public void setDownFlow(long downFlow) {
            this.downFlow = downFlow;
        }

        public void setSumFlow(long sumFlow) {
            this.sumFlow = sumFlow;
        }

        public long getUpFlow() {
            return upFlow;
        }

        public long getDownFlow() {
            return downFlow;
        }

        public long getSumFlow() {
            return sumFlow;
        }
    }
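
As a quick sanity check on the Writable implementation, here is a minimal sketch (not part of the original post; the class name PhoneFlowBeanRoundTrip is made up) that serializes a bean with write() and reads it back with readFields(), which is roughly what Hadoop does when it moves map output to the reducers:

    package com.mr.mypartition;

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;

    public class PhoneFlowBeanRoundTrip {
        public static void main(String[] args) throws IOException {
            PhoneFlowBean original = new PhoneFlowBean(2481, 24681);

            // Serialize the bean to bytes, as Hadoop does with map output
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            original.write(new DataOutputStream(bytes));

            // Deserialize into a fresh bean
            PhoneFlowBean copy = new PhoneFlowBean();
            copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

            System.out.println(copy); // expected: 2481  24681  27162 (tab-separated)
        }
    }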

Custom MyPartitioner

    package com.mr.mypartition;

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Partitioner;

    public class MyPartitioner extends Partitioner<Text, PhoneFlowBean> {

        // Return a different partition number based on the first three digits of the phone number
        @Override
        public int getPartition(Text text, PhoneFlowBean phoneFlowBean, int numPartitions) {
            String prefix = text.toString().substring(0, 3);
            if ("136".equals(prefix)) {
                return 0;
            } else if ("137".equals(prefix)) {
                return 1;
            } else if ("138".equals(prefix)) {
                return 2;
            } else if ("139".equals(prefix)) {
                return 3;
            } else {
                return 4;
            }
        }
    }
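
To see the mapping concretely, the following minimal local check (hypothetical, not from the original post) calls getPartition() directly on a few phone numbers taken from the sample input; a record assigned partition i ends up in output file part-r-0000i:

    package com.mr.mypartition;

    import org.apache.hadoop.io.Text;

    public class MyPartitionerCheck {
        public static void main(String[] args) {
            MyPartitioner partitioner = new MyPartitioner();
            PhoneFlowBean bean = new PhoneFlowBean(100, 200);

            // 5 is the number of reduce tasks configured in the driver
            System.out.println(partitioner.getPartition(new Text("13630577991"), bean, 5)); // 0 (prefix 136)
            System.out.println(partitioner.getPartition(new Text("13736230513"), bean, 5)); // 1 (prefix 137)
            System.out.println(partitioner.getPartition(new Text("13956435636"), bean, 5)); // 3 (prefix 139)
            System.out.println(partitioner.getPartition(new Text("18271575951"), bean, 5)); // 4 (everything else)
        }
    }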

Mapper class

    package com.mr.mypartition;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    import java.io.IOException;

    public class PhoneFlowMapper extends Mapper<LongWritable, Text, Text, PhoneFlowBean> {

        // 4. Create the output objects once so they are not re-created on every map() call
        Text phoneNumberText = new Text();
        PhoneFlowBean phoneFlowBean = new PhoneFlowBean();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // 1. Read the line of text and split it on the delimiter
            String string = value.toString();
            String[] split = string.split("\t");
            // 2. Extract the phone number and the upstream/downstream flow
            String phoneNumber = split[1];
            String upFlow = split[split.length - 3];
            String downFlow = split[split.length - 2];
            // 3. Populate the flowBean object
            phoneFlowBean.setUpFlow(Long.parseLong(upFlow));
            phoneFlowBean.setDownFlow(Long.parseLong(downFlow));
            // flowBean.setSumFlow(Long.parseLong(upFlow) + Long.parseLong(downFlow));
            phoneNumberText.set(phoneNumber);
            // FlowBean flowBean = new FlowBean(Long.parseLong(upFlow), Long.parseLong(downFlow));
            // 5. Emit the phone number and the flowBean to the reducer
            context.write(phoneNumberText, phoneFlowBean);
        }
    }
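
Note that the mapper reads the flow columns from the end of the line (split.length - 3 and split.length - 2) rather than by fixed index, because some records in the input have no URL column. A small standalone illustration (hypothetical, assuming the input file is tab-delimited as the mapper expects):

    package com.mr.mypartition;

    public class SplitDemo {
        public static void main(String[] args) {
            // One record with a URL column and one without, taken from the sample input
            String withUrl    = "1\t13736230513\t192.196.100.1\twww.hadoop.com\t2481\t24681\t200";
            String withoutUrl = "2\t13846544121\t192.196.100.2\t264\t0\t200";

            for (String line : new String[]{ withUrl, withoutUrl }) {
                String[] split = line.split("\t");
                String phone = split[1];                 // the phone number is always the second column
                String up    = split[split.length - 3];  // third column from the end
                String down  = split[split.length - 2];  // second column from the end
                System.out.println(phone + " up=" + up + " down=" + down);
            }
        }
    }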

Reducer class

    package com.mr.mypartition;

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    import java.io.IOException;

    public class PhoneFlowReducer extends Reducer<Text, PhoneFlowBean, Text, PhoneFlowBean> {

        // 4. Create the output FlowBean object once, outside reduce()
        PhoneFlowBean phoneFlowBean = new PhoneFlowBean();

        @Override
        protected void reduce(Text key, Iterable<PhoneFlowBean> values, Context context) throws IOException, InterruptedException {
            // 1. Totals for upstream and downstream flow
            long upTotal = 0;
            long downTotal = 0;
            // 2. Iterate over the values (the FlowBean objects grouped by phone number)
            for (PhoneFlowBean value : values) {
                upTotal += value.getUpFlow();
                downTotal += value.getDownFlow();
            }
            // 3. Populate the output FlowBean object
            phoneFlowBean.setUpFlow(upTotal);
            phoneFlowBean.setDownFlow(downTotal);
            phoneFlowBean.setSumFlow(upTotal + downTotal);
            // 5. Write the result
            context.write(key, phoneFlowBean);
        }
    }
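
For example, 13568436656 appears twice in the sample input (upstream/downstream 2481/24681 and 1116/954), so the reducer receives both beans under that one key and writes a single line for it: upstream 2481 + 1116 = 3597, downstream 24681 + 954 = 25635, total 29232.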

Driver class

    package com.mr.mypartition;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    import java.io.IOException;

    public class PhoneFlowDriver {

        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            // 0 Specify the input and output paths (hard-coded here for a local run)
            args = new String[]{ "E:\\Hadoop\\src\\main\\resources\\input\\iphone", "E:\\Hadoop\\src\\main\\resources\\mypart"};
            // 1 Get the configuration and create the job
            Configuration configuration = new Configuration();
            Job job = Job.getInstance(configuration);
            // 2 Set the driver class (setJarByClass)
            job.setJarByClass(PhoneFlowDriver.class);
            // 3 Set the Mapper and Reducer classes
            job.setMapperClass(PhoneFlowMapper.class);
            job.setReducerClass(PhoneFlowReducer.class);
            // 4 Set the map output key/value types
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(PhoneFlowBean.class);
            // 5 Set the final output key/value types (the reducer's output types)
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(PhoneFlowBean.class);
            // Set the custom partitioner
            job.setPartitionerClass(MyPartitioner.class);
            // Set the number of reduce tasks; this determines the number of output files
            job.setNumReduceTasks(5);
            // 6 Set the local input and output paths
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // 7 Submit the job and wait for completion
            boolean completion = job.waitForCompletion(true);
            System.exit(completion ? 0 : 1);
        }
    }
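
Note that the input and output paths are hard-coded into args above for a local run; when submitting to a cluster you would remove that line, package the classes into a jar, and run something like hadoop jar phoneflow.jar com.mr.mypartition.PhoneFlowDriver <input path> <output path> (the jar name and paths here are placeholders). Also note how the reduce-task count interacts with the partitioner: with fewer than 5 reduce tasks (but more than 1) the job fails with an "Illegal partition" error because getPartition() can return 4, with exactly 1 the partitioner is bypassed and everything goes to a single file, and with more than 5 the extra output files are simply empty.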

Execution results

(The original post shows a screenshot of the output directory here.) With five reduce tasks, the output directory contains five result files, part-r-00000 through part-r-00004, one per partition.
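
For instance, assuming the input file is tab-delimited as the mapper expects, the 136 partition (part-r-00000) should contain the two 136-prefixed numbers from the sample input, each with its upstream, downstream and total flow:

    13630577991    6960    690    7650
    13682846555    1938    2910    4848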
