mapreduce 自定义分区的实现

悠悠 2023-06-22 13:46 4阅读 0赞

### mapreduce 自定义分区的实现 ###

*   题目：
*   分析：
*   程序框架如下：
*   MyPartitioner类的代码实现如下：
*   FlowPartition类的代码实现如下：
*   总结：

#### 题目： ####

有如下流量数据，要求按照手机号归属地进行分区。  
其中：  
134—136 归属地属于上海  
137—138 归属地属于北京  
139—159 归属地属于深圳  
其它 归属地未知

手机号			上行流量		下行流量		总流量
    13480253104		2494800		2494800		4989600
    13502468823		101663100	1529437140	1631100240
    13560436666		15467760	13222440	28690200
    13560439658		28191240	81663120	109854360
    13602846565		26860680	40332600	67193280
    13660577991		96465600	9563400		106029000
    13719199419		3326400		0			3326400
    13726230503		34386660	342078660	376465320
    13726238888		34386660	342078660	376465320
    13760778710		1663200		1663200		3326400
    13826544101		3659040		0			3659040
    13922314466		41690880	51559200	93250080
    13925057413		153263880	668647980	821911860
    13926251106		3326400		0			3326400
    13926435656		1829520		20956320	22785840
    15013685858		50713740	49036680	99750420
    15920133257		43742160	40692960	84435120
    15989002119		26860680	2494800		29355480
    18211575961		21164220	29189160	50353380
    18320173382		132099660	33430320	165529980
    84138413		57047760	19847520	76895280

#### 分析： ####

默认的分区算法（HashPartitioner，按 key 的哈希值对 reducetask 个数取模）无法按手机号归属地进行分区，因此需要自定义分区。  
自定义分区流程如下：

1）定义一个类  继承  Partitioner
    2）重写 getPartition
    3）在job中，指定自定义分区类
    	job.setPartitionerClass(cls); 
    4）指定reducetask的个数（不指定时，默认只运行一个reducetask）
    	job.setNumReduceTasks(4);

#### 程序框架如下： ####

FlowPartition类实现mr过程  
MyPartitioner类实现自定义分区

#### MyPartitioner类的代码实现如下： ####

package com.aura.cn.partition;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Partitioner;
    /**
     * Custom partitioner that assigns each record to a reduce task according to
     * the first three characters of its phone-number key.
     *
     * <pre>
     *   prefix 134 - 136   Shanghai   part-r-00000   partition 0
     *   prefix 137 - 138   Beijing    part-r-00001   partition 1
     *   prefix 139 - 159   Shenzhen   part-r-00002   partition 2
     *   anything else      unknown    part-r-00003   partition 3
     * </pre>
     *
     * Four partitions are produced, so the job is expected to run with four
     * reduce tasks. To write a custom partitioner: 1) extend {@code Partitioner},
     * 2) override {@code getPartition}. The generic parameters are the key and
     * value types emitted by the map phase.
     *
     * @author Samuel
     * @version v1.0 (2019-12-16)
     */
    public class MyPartitioner extends Partitioner<Text, Text>{

    	/** Number of leading characters of the key that decide the partition. */
    	private static final int PREFIX_LENGTH = 3;

    	private static final int SHANGHAI_PARTITION = 0;
    	private static final int BEIJING_PARTITION = 1;
    	private static final int SHENZHEN_PARTITION = 2;
    	private static final int UNKNOWN_PARTITION = 3;

    	/**
    	 * Chooses the partition for one map output record.
    	 *
    	 * @param key           phone number emitted by the mapper
    	 * @param value         remaining columns of the record (not inspected)
    	 * @param numPartitions number of partitions; by default this equals the
    	 *                      number of reduce tasks (not inspected: the fixed
    	 *                      indexes 0-3 are returned regardless)
    	 * @return 0, 1 or 2 for the known prefix ranges, 3 for everything else,
    	 *         including keys shorter than three characters
    	 */
    	@Override
    	public int getPartition(Text key, Text value, int numPartitions) {
    		String phone = key.toString();
    		// A key shorter than the prefix cannot belong to a known range; without
    		// this guard substring() would throw StringIndexOutOfBoundsException.
    		if (phone.length() < PREFIX_LENGTH) {
    			return UNKNOWN_PARTITION;
    		}

    		// Lexicographic comparison of equal-length digit strings gives the same
    		// ordering as comparing the numbers themselves.
    		String prefix = phone.substring(0, PREFIX_LENGTH);
    		if (inRange(prefix, "134", "136")) {
    			return SHANGHAI_PARTITION;
    		}
    		if (inRange(prefix, "137", "138")) {
    			return BEIJING_PARTITION;
    		}
    		if (inRange(prefix, "139", "159")) {
    			return SHENZHEN_PARTITION;
    		}
    		return UNKNOWN_PARTITION;
    	}

    	/** Returns whether {@code prefix} lies in the inclusive range [low, high]. */
    	private static boolean inRange(String prefix, String low, String high) {
    		return prefix.compareTo(low) >= 0 && prefix.compareTo(high) <= 0;
    	}
    }

#### FlowPartition类的代码实现如下： ####

package com.aura.cn.partition;
    
    import java.io.IOException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    /**
     * MapReduce driver that splits phone traffic records into one output file
     * per phone-number region, using {@code MyPartitioner} for the partitioning.
     *
     * @author Samuel
     * @version v1.0 (2019-12-16)
     */
    public class FlowPartition {

    	/** Counter group used for records the mapper has to skip. */
    	private static final String COUNTER_GROUP = "FlowPartition";
    	/** Counter name for input lines that have no key/value separator. */
    	private static final String MALFORMED_LINES = "MALFORMED_LINES";

    	/**
    	 * Map phase: emits the phone number as key and the rest of the line as
    	 * value. A typical input line looks like
    	 * {@code 13480253104<TAB>2494800<TAB>2494800<TAB>4989600}.
    	 * The shuffle then partitions by phone-number region (custom partitioner),
    	 * sorts and groups by key.
    	 */
    	static class PartitionMapper extends Mapper<LongWritable, Text, Text, Text>{

    		// Reused across map() calls to avoid allocating two objects per record.
    		private final Text mk = new Text();
    		private final Text mv = new Text();

    		/**
    		 * Splits one input line at its first tab into phone number and payload.
    		 *
    		 * @param key     input key supplied by the input format (unused)
    		 * @param value   one line of the input file
    		 * @param context used to emit the (phone, payload) pair
    		 */
    		@Override
    		protected void map(LongWritable key, Text value, Context context)
    				throws IOException, InterruptedException {
    			String line = value.toString();

    			// Split on the position of the first tab rather than searching for
    			// the text of the second column: that text (for example "0") may
    			// also occur inside the phone number and would be matched there.
    			int firstTab = line.indexOf('\t');
    			boolean hasPhone = firstTab > 0;
    			boolean hasPayload = firstTab >= 0 && firstTab < line.length() - 1;
    			if (!hasPhone || !hasPayload) {
    				// Blank or malformed line: count it instead of failing the task.
    				context.getCounter(COUNTER_GROUP, MALFORMED_LINES).increment(1);
    				return;
    			}

    			mk.set(line.substring(0, firstTab));
    			mv.set(line.substring(firstTab + 1));
    			context.write(mk, mv);
    		}
    	}

    	/*
    	 * Shuffle phase:
    	 *   1) Partition by the first three characters of the map key:
    	 *        134 - 136
    	 *        137 - 138
    	 *        139 - 159
    	 *        anything else -> unknown
    	 *   2) Sort by map key, separately inside every partition, e.g.
    	 *        13480253104	2494800		2494800		4989600
    	 *        13502468823	101663100	1529437140	1631100240
    	 *        13560436666	15467760	13222440	28690200
    	 *        13560439658	28191240	81663120	109854360
    	 *        13602846565	26860680	40332600	67193280
    	 *        13660577991	96465600	9563400		106029000
    	 *   3) Group by map key: records with the same phone number form a group.
    	 *
    	 * Reduce phase:
    	 *   The shuffle has already sorted and grouped each partition, so the
    	 *   reducer only has to write the records out.
    	 */

    	/**
    	 * Reduce phase: writes every record of a group back out as
    	 * {@code phone<TAB>payload}. Each reduce task instantiates this class once
    	 * and calls {@code reduce} repeatedly, so one instance handles one partition.
    	 */
    	static class PartitionReducer extends Reducer<Text, Text, Text, NullWritable>{

    		// Reused across reduce() calls.
    		private final Text rk = new Text();

    		/**
    		 * Emits one output line per value of the group.
    		 *
    		 * @param key     phone number shared by the group
    		 * @param values  payloads of all records with that phone number
    		 * @param context used to emit the output lines
    		 */
    		@Override
    		protected void reduce(Text key, Iterable<Text> values,Context context)
    				throws IOException, InterruptedException {
    			String phone = key.toString();
    			for (Text v : values) {
    				rk.set(phone+"\t"+v.toString());
    				context.write(rk, NullWritable.get());
    			}
    		}
    	}

    	/**
    	 * Configures and runs the job, then exits with status 0 on success and 1
    	 * on failure. An existing output directory is deleted first.
    	 *
    	 * @param args command-line arguments (unused)
    	 */
    	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    		Configuration conf = new Configuration();
    		// Build the job from the same configuration that is used for the file
    		// system below, so both see identical settings.
    		Job job = Job.getInstance(conf);
    		job.setJarByClass(FlowPartition.class);

    		job.setMapperClass(PartitionMapper.class);
    		job.setReducerClass(PartitionReducer.class);

    		job.setMapOutputKeyClass(Text.class);
    		job.setMapOutputValueClass(Text.class);
    		job.setOutputKeyClass(Text.class);
    		job.setOutputValueClass(NullWritable.class);

    		// The custom partitioner returns indexes 0-3, so four reduce tasks are
    		// needed to give every partition its own task.
    		job.setPartitionerClass(MyPartitioner.class);
    		job.setNumReduceTasks(4);

    		FileInputFormat.addInputPath(job, new Path("D:\\bd1904\\data\\{flowout/*}") );

    		// Remove a previous run's output directory before setting the path.
    		FileSystem fs = FileSystem.get(conf);
    		Path outPath =new Path("D:\\bd1904\\data\\flowout_partition");
    		if(fs.exists(outPath)) {
    			fs.delete(outPath, true);
    		}
    		FileOutputFormat.setOutputPath(job, outPath);

    		// Submit, print progress, and report the outcome through the exit code.
    		boolean succeeded = job.waitForCompletion(true);
    		System.exit(succeeded ? 0 : 1);
    	}
    }

#### 总结： ####

注意：  
1）自定义分区中，分区和reducetask是一一对应的，因此，分区编号一定要和reducetask的编号一一对应。  
2）reducetask的编号，默认从0开始，顺序递增的。

虽然自定义分区中，分区编号是可以自己定义返回值的，不一定要顺序递增。但是出于性能考虑，分区编号最好是顺序递增的，reducetask的设置和分区个数相同，否则必然有reducetask在执行空跑。  
reducetask设置为 **1**（此时只有一个分区，框架不会调用自定义分区类）或者大于返回值最大值，程序可以跑通。如果reducetask设置的值大于 1 且不大于自定义返回值的最大值，则会报 **Illegal partition for …** 的错误。