Hadoop常用算法


package com.cuiweiyou.test;

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;     // 注意这两个类的路径，和之前版本不同
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
//import org.apache.hadoop.mapred.FileInputFormat;  // 如果使用这两个类，要求JobConf
//import org.apache.hadoop.mapred.FileOutputFormat;

public class WordCount {
    /**
     * 先经过mapper运算，然后才是reducer。
     * 内部类：映射器 Mapper<Key_IN, Value_IN, Key_OUT, Value_OUT>
     * 首先读取源文本
     */
　　public static class MyMapper extends Mapper<Object, Text, Text, IntWritable> {
　　　　//占位体，1，查到一个就占个坑
　　　　private static final IntWritable one = new IntWritable(1);
　　　　//文本
　　　　private Text word = new Text();
        /** 
         * 重写map方法，实现理想效果
         * MyMapper的实例只有一个，但实例的这个map方法却一直在执行，直到读取结束
         * Key1：本行首字符在全文中的索引。Value1：本行的文本。context：上下文对象，在整个wordcount运算生命周期内存活
         * 这里K1、V1像这样[K,V]
        **/
　　　　public void map(Object key1, Text value1, Mapper<Object, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
　　　　　　//拆分字符串，返回单词集合。默认以空格和换行/回车拆分
　　　　　　StringTokenizer itr = new StringTokenizer(value1.toString());
　　　　　　//遍历一行的全部单词
　　　　　　while (itr.hasMoreTokens()) {
　　　　　　　　//将文本转为临时Text变量
　　　　　　　　this.word.set(itr.nextToken());
　　　　　　　　//将单词保存到上下文对象中（单词，占位体），输出
　　　　　　　　context.write(this.word, one);
　　　　　　}
　　　　}
　　}

    /************************************************************************
     *  在Mapper后，Reducer前，有个shuffle过程，会根据k2将对应的v2归并为v2[...]  *
     *  www.cuiweiyou.com
     *************************************************************************/
    
    /**
     * mapper结束后，执行现在的reducer。
     * 内部类：拆分器 Reducer<Key_IN, Value_IN, Key_OUT, Value_OUT>
     */
　　public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
　　　　 //计数器。个数统计
　　　　private IntWritable result = new IntWritable();
　　　　
        /** 
         * 重写reduce方法，实现理想效果
         * MyReducer的实例也只有一个，但实例的这个reduce方法却一直在执行，直到完成统计
         * Key2：单词。Values2：value的集合，也就是[1,1,1,...]。context：上下文对象
         * 这里这里K2、V2像这样[K,V[1,1,1,...]]。每执行一次，key就是一个新单词，对应的values就是其全部占位体
        **/
　　　　public void reduce(Text key2, Iterable<IntWritable> values2, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
　　　　　　
　　　　　　int sum = 0;
　　　　　　
　　　　　　//累加V2的元素，有多少个 占位体1 ，即有多少个指定单词
　　　　　　for (IntWritable val : values2) {
　　　　　　　　sum += val.get();
　　　　　　}
　　　　　　this.result.set(sum);
　　　　　　//终于将单词和总个数再次输出
　　　　　　context.write(key2, this.result);　　// 输出到 hdfs:/output 中到结果文件
　　　　}
　　}

　　public static void main(String[] args) throws Exception {
　　　　// HDFS配置
　　　　Configuration conf = new Configuration();
　　　　conf.set("fs.defaultFS", "hdfs://localhost:9000");

　　　　// 作业（环境，作业名）
　　　　Job job = Job.getInstance(conf, "word count");
　　　　job.setJarByClass(WordCount.class);　　　　　　// 执行作业的类
　　　　job.setMapperClass(MyMapper.class);　　　　　　// 读取源数据，执行map运算的类
　　　　/* Combiner
　　　　 * 通常，每一个map可能会产生大量的输出，combiner的作用就是在map端对输出先做一次合并，以减少传输到reducer的数据量。
　　　　 * combiner的输入输出类型必须和mapper的输出以及reducer的输入类型一致 */
　　　　job.setCombinerClass(MyReducer.class);　　　　// 统计数据，执行reduce的类
　　　　job.setReducerClass(MyReducer.class);　　　　 // 统计数据，执行reduce的类
　　　　job.setOutputKeyClass(Text.class);　　　　　　// 设置输出的key类型，和Context上下文对象write的参数类型一致
　　　　job.setOutputValueClass(IntWritable.class);　// 设置输出的value类型

　　　　FileInputFormat.addInputPath(job, new Path("hdfs:/input"));　　　　// 源数据路径，须已存在
　　　　FileOutputFormat.setOutputPath(job, new Path("hdfs:/output"));　　// 统计结果输出路径，须程序自动创建

　　　　// 等待提交作业到集群并完成，才结束程序
　　　　System.exit(job.waitForCompletion(true) ? 0 : 1);
　　}
}

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

package com.cuiweiyou.test;

import java.io.IOException;

import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // 注意这两个类的路径，和之前版本不同

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

//import org.apache.hadoop.mapred.FileInputFormat; // 如果使用这两个类，要求JobConf

//import org.apache.hadoop.mapred.FileOutputFormat;

public class WordCount {

/**

* 先经过mapper运算，然后才是reducer。

* 内部类：映射器 Mapper<Key_IN, Value_IN, Key_OUT, Value_OUT>

* 首先读取源文本

*/

　　public static class MyMapper extends Mapper<Object, Text, Text, IntWritable> {

　　　　//占位体，1，查到一个就占个坑

　　　　private static final IntWritable one = new IntWritable(1);

　　　　//文本

　　　　private Text word = new Text();

/**

* 重写map方法，实现理想效果

* MyMapper的实例只有一个，但实例的这个map方法却一直在执行，直到读取结束

* Key1：本行首字符在全文中的索引。Value1：本行的文本。context：上下文对象，在整个wordcount运算生命周期内存活

* 这里K1、V1像这样[K,V]

**/

　　　　public void map(Object key1, Text value1, Mapper<Object, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {

　　　　　　//拆分字符串，返回单词集合。默认以空格和换行/回车拆分

　　　　　　StringTokenizer itr = new StringTokenizer(value1.toString());

　　　　　　//遍历一行的全部单词

　　　　　　while (itr.hasMoreTokens()) {

　　　　　　　　//将文本转为临时Text变量

　　　　　　　　this.word.set(itr.nextToken());

　　　　　　　　//将单词保存到上下文对象中（单词，占位体），输出

　　　　　　　　context.write(this.word, one);

　　　　　　}

/************************************************************************

* 在Mapper后，Reducer前，有个shuffle过程，会根据k2将对应的v2归并为v2[...] *

* www.cuiweiyou.com

*************************************************************************/

/**

* mapper结束后，执行现在的reducer。

* 内部类：拆分器 Reducer<Key_IN, Value_IN, Key_OUT, Value_OUT>

*/

　　public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

　　　　 //计数器。个数统计

　　　　private IntWritable result = new IntWritable();

/**

* 重写reduce方法，实现理想效果

* MyReducer的实例也只有一个，但实例的这个reduce方法却一直在执行，直到完成统计

* Key2：单词。Values2：value的集合，也就是[1,1,1,...]。context：上下文对象

* 这里这里K2、V2像这样[K,V[1,1,1,...]]。每执行一次，key就是一个新单词，对应的values就是其全部占位体

**/

　　　　public void reduce(Text key2, Iterable<IntWritable> values2, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {

　　　　　　int sum = 0;

　　　　　　//累加V2的元素，有多少个占位体1 ，即有多少个指定单词

　　　　　　for (IntWritable val : values2) {

　　　　　　　　sum += val.get();

　　　　　　}

　　　　　　this.result.set(sum);

　　　　　　//终于将单词和总个数再次输出

　　　　　　context.write(key2, this.result);　　// 输出到 hdfs:/output 中到结果文件

　　　　}

　　public static void main(String[] args) throws Exception {

　　　　// HDFS配置

　　　　Configuration conf = new Configuration();

　　　　conf.set("fs.defaultFS", "hdfs://localhost:9000");

　　　　// 作业（环境，作业名）

　　　　Job job = Job.getInstance(conf, "word count");

　　　　job.setJarByClass(WordCount.class);　　　　　　// 执行作业的类

　　　　job.setMapperClass(MyMapper.class);　　　　　　// 读取源数据，执行map运算的类

　　　　/* Combiner

　　　　 * 通常，每一个map可能会产生大量的输出，combiner的作用就是在map端对输出先做一次合并，以减少传输到reducer的数据量。

　　　　 * combiner的输入输出类型必须和mapper的输出以及reducer的输入类型一致 */

　　　　job.setCombinerClass(MyReducer.class);　　　　// 统计数据，执行reduce的类

　　　　job.setReducerClass(MyReducer.class);　　　　 // 统计数据，执行reduce的类

　　　　job.setOutputKeyClass(Text.class);　　　　　　// 设置输出的key类型，和Context上下文对象write的参数类型一致

　　　　job.setOutputValueClass(IntWritable.class);　// 设置输出的value类型

　　　　FileInputFormat.addInputPath(job, new Path("hdfs:/input"));　　　　// 源数据路径，须已存在

　　　　FileOutputFormat.setOutputPath(job, new Path("hdfs:/output"));　　// 统计结果输出路径，须程序自动创建

　　　　// 等待提交作业到集群并完成，才结束程序

　　　　System.exit(job.waitForCompletion(true) ? 0 : 1);

　　}


package com.cuiweiyou.test;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

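// Hadoop's default shuffle sort orders records by the map output key (k2) only; values (v2) are not sorted:
// - Text keys are sorted lexicographically
// - LongWritable keys are sorted by numeric value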

public class Sort {
    /**
     * 内部类：映射器 Mapper<KEY_IN, VALUE_IN, KEY_OUT, VALUE_OUT>
     * 读数据
     */
    public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {
        /**
         * 重写map方法
         * 每行一个数字，每次读一行
         */
        public void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
            //k1是行号，无用，舍弃k1
        　　// 这里文本v1转为数字k2。null为新的v2
            context.write(new LongWritable(Long.parseLong(v1.toString())), NullWritable.get());
            // 新的k2可能有重复，但没有保存对应的占位体
        }
    }

    /*** 在此方法执行前，有个shuffle过程，会根据k2将对应的v2归并为v2[...] ***/

    /**
     * 内部类：拆分器 Reducer<KEY_IN, VALUE_IN, KEY_OUT, VALUE_OUT>
     */
    public static class MyReducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {
        /**
         * 重写reduce方法
         */
        protected void reduce(LongWritable k2, Iterable<NullWritable> v2, Context context) throws IOException, InterruptedException {
            //数字k2转为结果k3, v2[...]舍弃
            context.write(k2, NullWritable.get());
            //此时，k3如果发生重复，根据默认算法会发生覆盖，即最终仅保存一个k3
        }
    }

    public static void main(String[] args) throws Exception {
　　　　// HDFS配置
　　　　Configuration conf = new Configuration();
　　　　conf.set("fs.defaultFS", "hdfs://localhost:9000");

　　　　// 作业（环境，作业名）
　　　　Job job = Job.getInstance(conf, "SortTest");
　　　　job.setJarByClass(Sort.class);　　　　　　　　　// 执行作业的类
　　　　job.setMapperClass(MyMapper.class);　　　　　　 // 读取源数据，执行map运算的类
　　　　job.setCombinerClass(MyReducer.class);　　　　 // 统计数据，执行reduce的类
　　　　job.setReducerClass(MyReducer.class);　　　　  // 统计数据，执行reduce的类
　　　　
　　　　job.setOutputKeyClass(LongWritable.class);　　　　// 设置输出的key类型，和Context上下文对象write的参数类型一致
　　　　job.setOutputValueClass(NullWritable.class);　　  // 设置输出的value类型

　　　　FileInputFormat.addInputPath(job, new Path("hdfs:/input"));　　　　// 源数据路径，须已存在
　　　　FileOutputFormat.setOutputPath(job, new Path("hdfs:/output"));　　 // 统计结果输出路径，须程序自动创建

　　　　// 等待提交作业到集群并完成，才结束程序
　　　　System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

package com.cuiweiyou.test;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

//hadoop默认排序：

//如果k2、v2类型是Text-文本，结果是按照字典顺序

//如果k2、v2类型是LongWritable-数字，结果是按照数字大小顺序

public class Sort {

/**

* 内部类：映射器 Mapper<KEY_IN, VALUE_IN, KEY_OUT, VALUE_OUT>

* 读数据

*/

public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {

/**

* 重写map方法

* 每行一个数字，每次读一行

*/

public void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {

//k1是行号，无用，舍弃k1

　　// 这里文本v1转为数字k2。null为新的v2

context.write(new LongWritable(Long.parseLong(v1.toString())), NullWritable.get());

// 新的k2可能有重复，但没有保存对应的占位体

}

/*** 在此方法执行前，有个shuffle过程，会根据k2将对应的v2归并为v2[...] ***/

/**

* 内部类：拆分器 Reducer<KEY_IN, VALUE_IN, KEY_OUT, VALUE_OUT>

*/

public static class MyReducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {

/**

* 重写reduce方法

*/

protected void reduce(LongWritable k2, Iterable<NullWritable> v2, Context context) throws IOException, InterruptedException {

//数字k2转为结果k3, v2[...]舍弃

context.write(k2, NullWritable.get());

//此时，k3如果发生重复，根据默认算法会发生覆盖，即最终仅保存一个k3

}

public static void main(String[] args) throws Exception {

　　　　// HDFS配置

　　　　Configuration conf = new Configuration();

　　　　conf.set("fs.defaultFS", "hdfs://localhost:9000");

　　　　// 作业（环境，作业名）

　　　　Job job = Job.getInstance(conf, "SortTest");

　　　　job.setJarByClass(Sort.class);　　　　　　　　　// 执行作业的类

　　　　job.setMapperClass(MyMapper.class);　　　　　　 // 读取源数据，执行map运算的类

　　　　job.setCombinerClass(MyReducer.class);　　　　 // 统计数据，执行reduce的类

　　　　job.setReducerClass(MyReducer.class);　　　　 // 统计数据，执行reduce的类

　　　　job.setOutputKeyClass(LongWritable.class);　　　　// 设置输出的key类型，和Context上下文对象write的参数类型一致

　　　　job.setOutputValueClass(NullWritable.class);　　 // 设置输出的value类型

　　　　FileInputFormat.addInputPath(job, new Path("hdfs:/input"));　　　　// 源数据路径，须已存在

　　　　FileOutputFormat.setOutputPath(job, new Path("hdfs:/output"));　　 // 统计结果输出路径，须程序自动创建

　　　　// 等待提交作业到集群并完成，才结束程序

　　　　System.exit(job.waitForCompletion(true) ? 0 : 1);

}


关键代码
/*
     * 内部类：映射器 Mapper<KEY_IN, VALUE_IN, KEY_OUT, VALUE_OUT>
     */
    public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {
        /****
         * 重写map方法
        ****/
        public void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
            //因为我们读入的数据就是一行一个数字，直接使用
            //这个数字有几个都无所谓，只有知道有这么一个数字即可，所以输出的v2为null
            context.write(new LongWritable(Long.parseLong(v1.toString())), NullWritable.get());
        }
    }
    
    /** 在此方法执行前，有个shuffle过程，会根据k2将对应的v2归并为v2[...] **/

    /*
     * 内部类：拆分器 Reducer<KEY_IN, VALUE_IN, KEY_OUT, VALUE_OUT>
     */
    public static class MyReducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {
        /****
         * 重写reduce方法
        ****/
        protected void reduce(LongWritable k2, Iterable<NullWritable> v2, Context context) throws IOException, InterruptedException {
            //此时，k3（即眼前的k2）如果发生重复，根据默认算法会发生覆盖，即最终仅保存一个k3，达到去重到效果，而v3是null无所谓
            context.write(k2, NullWritable.get());

        }
    }

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

关键代码

/*

* 内部类：映射器 Mapper<KEY_IN, VALUE_IN, KEY_OUT, VALUE_OUT>

*/

public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {

/****

* 重写map方法

****/

public void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {

//因为我们读入的数据就是一行一个数字，直接使用

//这个数字有几个都无所谓，只有知道有这么一个数字即可，所以输出的v2为null

context.write(new LongWritable(Long.parseLong(v1.toString())), NullWritable.get());

}

/** 在此方法执行前，有个shuffle过程，会根据k2将对应的v2归并为v2[...] **/

/*

* 内部类：拆分器 Reducer<KEY_IN, VALUE_IN, KEY_OUT, VALUE_OUT>

*/

public static class MyReducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {

/****

* 重写reduce方法

****/

protected void reduce(LongWritable k2, Iterable<NullWritable> v2, Context context) throws IOException, InterruptedException {

//此时，k3（即眼前的k2）如果发生重复，根据默认算法会发生覆盖，即最终仅保存一个k3，达到去重到效果，而v3是null无所谓

context.write(k2, NullWritable.get());

}


关键代码
/*
     * 内部类：映射器 Mapper<KEY_IN, VALUE_IN, KEY_OUT, VALUE_OUT>
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        String tmp = "8238";
        
        /**
         * 重写map方法。k1：行首字符索引，v1：这一行文本
        **/
        protected void map(LongWritable k1, Text v1, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException ,InterruptedException {
            System.out.println(v1+", "+tmp);
            //如果行文本是指定值，过滤之
            if(v1.toString().equals(tmp)){
                System.out.println("有了");
                //保存（按照泛型限制，k2是Text，v2是Nullritable）
                context.write(v1, NullWritable.get());
            }
        }
    }

    /*
     * 内部类：拆分器 Reducer<KEY_IN, VALUE_IN, KEY_OUT, VALUE_OUT>
     */
    public static class MyReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
        /**
         * 重写reduce方法
        **/
        protected void reduce(Text k2, Iterable<NullWritable> v2, Reducer<Text, NullWritable, Text, NullWritable>.Context context) throws IOException ,InterruptedException {
            context.write(k2, NullWritable.get());
        }
    }

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

关键代码

/*

* 内部类：映射器 Mapper<KEY_IN, VALUE_IN, KEY_OUT, VALUE_OUT>

*/

public static class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

String tmp = "8238";

/**

* 重写map方法。k1：行首字符索引，v1：这一行文本

**/

protected void map(LongWritable k1, Text v1, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException ,InterruptedException {

System.out.println(v1+", "+tmp);

//如果行文本是指定值，过滤之

if(v1.toString().equals(tmp)){

System.out.println("有了");

//保存（按照泛型限制，k2是Text，v2是Nullritable）

context.write(v1, NullWritable.get());

}

/*

* 内部类：拆分器 Reducer<KEY_IN, VALUE_IN, KEY_OUT, VALUE_OUT>

*/

public static class MyReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

/**

* 重写reduce方法

**/

protected void reduce(Text k2, Iterable<NullWritable> v2, Reducer<Text, NullWritable, Text, NullWritable>.Context context) throws IOException ,InterruptedException {

context.write(k2, NullWritable.get());

}


关键代码
// map(泛型定义了输入和输出类型)
    public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {

        // 首先创建一个临时变量，保存一个可存储的最小值：Long.MIN_VALUE=-9223372036854775808
        long temp = Long.MIN_VALUE;

        // 找出最大值。这个map不断迭代v1，最终保存最大值
        protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
            
            // 将文本转数值
            long val = Long.parseLong(v1.toString());
            // 如果v1比临时变量大，则保存v1的值
            if (temp < val) {
                temp = val;
            }
        }

        /** ---此方法在全部的map任务结束后执行一次。这时仅输出临时变量到最大值--- **/
        protected void cleanup(Context context) throws IOException, InterruptedException {
            context.write(new LongWritable(temp), NullWritable.get());
            System.out.println("文件读取完毕，保存最大值");    //输出两次，对应两个文本文件
        }
    }

    // reduce
    public static class MyReducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {
        // 临时变量
        Long temp = Long.MIN_VALUE;

        // 因为一个文件得到一个最大值，我们有两个txt文件会得到两个值。再次将这些值比对，得到最大的
        protected void reduce(LongWritable k2, Iterable<NullWritable> v2, Context context) throws IOException, InterruptedException {

            long val = Long.parseLong(k2.toString());
            // 如果k2比临时变量大，则保存k2的值
            if (temp < val) {
                temp = val;
            }
        }

        /** ！！！此方法在全部的reduce任务结束后执行一次。这时仅输出唯一最大值！！！ **/
        protected void cleanup(Context context) throws IOException, InterruptedException {
            context.write(new LongWritable(temp), NullWritable.get());
        }
    }

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

关键代码

// map(泛型定义了输入和输出类型)

public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {

// 首先创建一个临时变量，保存一个可存储的最小值：Long.MIN_VALUE=-9223372036854775808

long temp = Long.MIN_VALUE;

// 找出最大值。这个map不断迭代v1，最终保存最大值

protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {

// 将文本转数值

long val = Long.parseLong(v1.toString());

// 如果v1比临时变量大，则保存v1的值

if (temp < val) {

temp = val;

}

/** ---此方法在全部的map任务结束后执行一次。这时仅输出临时变量到最大值--- **/

protected void cleanup(Context context) throws IOException, InterruptedException {

context.write(new LongWritable(temp), NullWritable.get());

System.out.println("文件读取完毕，保存最大值"); //输出两次，对应两个文本文件

}

// reduce

public static class MyReducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {

// 临时变量

Long temp = Long.MIN_VALUE;

// 因为一个文件得到一个最大值，我们有两个txt文件会得到两个值。再次将这些值比对，得到最大的

protected void reduce(LongWritable k2, Iterable<NullWritable> v2, Context context) throws IOException, InterruptedException {

long val = Long.parseLong(k2.toString());

// 如果k2比临时变量大，则保存k2的值

if (temp < val) {

temp = val;

}

/** ！！！此方法在全部的reduce任务结束后执行一次。这时仅输出唯一最大值！！！ **/

protected void cleanup(Context context) throws IOException, InterruptedException {

context.write(new LongWritable(temp), NullWritable.get());

}


	// map
    public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {
    	
    	public MyMapper() {
			System.err.println("MyMapper实例化......");
		}

        // 首先创建一个临时变量，保存一个可存储的最小值：Long.MIN_VALUE=-9223372036854775808
        long temp = Long.MIN_VALUE;
        // Top5存储空间，我们取前5个
        long[] tops;

        /** 这个方法在run中调用，在全部map之前执行一次 **/
        protected void setup(Context context) {
            // 初始化数组长度为5
            tops = new long[5];
            
            System.err.println("Mapper-setup执行。。。");
        }

        // 找出最大值
        public void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
            
            // 将文本转数值
            final long val = Long.parseLong(v1.toString());
            System.err.println("map读到：" + val);
            
            // 保存在0索引
            if(val > tops[0])
            	tops[0] = val;
            
            // 排序后最大值在最后一个索引，这样从[5]到[0]依次减小。每执行一次map，最小的[0]都会赋予新值
            Arrays.sort(tops);
            
            System.err.println("map ing ---" + Arrays.toString(tops));
        }

        /** ---此方法在全部的map任务结束后执行一次。输出map后得到的前5个最大值--- **/
        protected void cleanup(Context context) throws IOException, InterruptedException {
            for (int i = 0; i < tops.length; i++) {
                context.write(new LongWritable(tops[i]), NullWritable.get());
            }
            
            System.err.println("Mapper-cleanup处理。。。");
        }
    }

    // reduce
    public static class MyReducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {
    	public MyReducer() {
			System.err.println("MyReduce instance ...");
		}
    	
        Long temp = Long.MIN_VALUE;
        long[] tops;

        /** 次方法在run中调用，在全部map之前执行一次 **/
        protected void setup(Context context) {
            tops = new long[5];
            
            System.err.println("MyReduce-setup...");
        }

        // 因为每个文件都得到5个值，再次将这些值比对，得到最大的
        protected void reduce(LongWritable k2, Iterable<NullWritable> v2, Context context) throws IOException, InterruptedException {
            long top = Long.parseLong(k2.toString());
            System.err.println("reduce读到：" + top);
            
            if (top>tops[0])
            	tops[0] = top;
            
            Arrays.sort(tops);
            
            System.err.println("reduce ing ---" + Arrays.toString(tops));
        }

        /** ---此方法在全部到reduce任务结束后执行一次--- **/
        protected void cleanup(Context context) throws IOException, InterruptedException {
            for (int i = 0; i < tops.length; i++) {
                context.write(new LongWritable(tops[i]), NullWritable.get());
            }
            
            System.err.println("MyReduce-cleanup...");
        }
    }

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

// map

public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {

public MyMapper() {

System.err.println("MyMapper实例化......");

}

// 首先创建一个临时变量，保存一个可存储的最小值：Long.MIN_VALUE=-9223372036854775808

long temp = Long.MIN_VALUE;

// Top5存储空间，我们取前5个

long[] tops;

/** 这个方法在run中调用，在全部map之前执行一次 **/

protected void setup(Context context) {

// 初始化数组长度为5

tops = new long[5];

System.err.println("Mapper-setup执行。。。");

}

// 找出最大值

public void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {

// 将文本转数值

final long val = Long.parseLong(v1.toString());

System.err.println("map读到：" + val);

// 保存在0索引

if(val > tops[0])

tops[0] = val;

// 排序后最大值在最后一个索引，这样从[5]到[0]依次减小。每执行一次map，最小的[0]都会赋予新值

Arrays.sort(tops);

System.err.println("map ing ---" + Arrays.toString(tops));

}

/** ---此方法在全部的map任务结束后执行一次。输出map后得到的前5个最大值--- **/

protected void cleanup(Context context) throws IOException, InterruptedException {

for (int i = 0; i < tops.length; i++) {

context.write(new LongWritable(tops[i]), NullWritable.get());

}

System.err.println("Mapper-cleanup处理。。。");

}

// reduce

public static class MyReducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {

public MyReducer() {

System.err.println("MyReduce instance ...");

}

Long temp = Long.MIN_VALUE;

long[] tops;

/** 次方法在run中调用，在全部map之前执行一次 **/

protected void setup(Context context) {

tops = new long[5];

System.err.println("MyReduce-setup...");

}

// 因为每个文件都得到5个值，再次将这些值比对，得到最大的

protected void reduce(LongWritable k2, Iterable<NullWritable> v2, Context context) throws IOException, InterruptedException {

long top = Long.parseLong(k2.toString());

System.err.println("reduce读到：" + top);

if (top>tops[0])

tops[0] = top;

Arrays.sort(tops);

System.err.println("reduce ing ---" + Arrays.toString(tops));

}

/** ---此方法在全部到reduce任务结束后执行一次--- **/

protected void cleanup(Context context) throws IOException, InterruptedException {

for (int i = 0; i < tops.length; i++) {

context.write(new LongWritable(tops[i]), NullWritable.get());

}

System.err.println("MyReduce-cleanup...");

}


#  第一次 map，第一个txt文件
MyMapper实例化......
Mapper-setup执行。。。
map读到：8764
map ing ---[0, 0, 0, 0, 8764]
map读到：7367
map ing ---[0, 0, 0, 7367, 8764]
map读到：3498
map ing ---[0, 0, 3498, 7367, 8764]
map读到：483275
map ing ---[0, 3498, 7367, 8764, 483275]
map读到：632300
map ing ---[3498, 7367, 8764, 483275, 632300]
map读到：3450
map ing ---[3498, 7367, 8764, 483275, 632300]
map读到：10
map ing ---[3498, 7367, 8764, 483275, 632300]
map读到：4
map ing ---[3498, 7367, 8764, 483275, 632300]
map读到：8
map ing ---[3498, 7367, 8764, 483275, 632300]
map读到：33
map ing ---[3498, 7367, 8764, 483275, 632300]
map读到：5
map ing ---[3498, 7367, 8764, 483275, 632300]
map读到：8
map ing ---[3498, 7367, 8764, 483275, 632300]
map读到：6
map ing ---[3498, 7367, 8764, 483275, 632300]
map读到：45
map ing ---[3498, 7367, 8764, 483275, 632300]
Mapper-cleanup处理。。。

#  第一次 reduce，第一个txt文件map后的结果
MyReduce instance ...
MyReduce-setup...
reduce读到：3498
reduce ing ---[0, 0, 0, 0, 3498]
reduce读到：7367
reduce ing ---[0, 0, 0, 3498, 7367]
reduce读到：8764
reduce ing ---[0, 0, 3498, 7367, 8764]
reduce读到：483275
reduce ing ---[0, 3498, 7367, 8764, 483275]
reduce读到：632300
reduce ing ---[3498, 7367, 8764, 483275, 632300]
MyReduce-cleanup...

#  第二次 map，第二个txt文件
MyMapper实例化......
Mapper-setup执行。。。
map读到：8764
map ing ---[0, 0, 0, 0, 8764]
map读到：7367
map ing ---[0, 0, 0, 7367, 8764]
map读到：3498
map ing ---[0, 0, 3498, 7367, 8764]
map读到：483275
map ing ---[0, 3498, 7367, 8764, 483275]
map读到：6323
map ing ---[3498, 6323, 7367, 8764, 483275]
map读到：3450
map ing ---[3498, 6323, 7367, 8764, 483275]
map读到：10
map ing ---[3498, 6323, 7367, 8764, 483275]
map读到：4
map ing ---[3498, 6323, 7367, 8764, 483275]
map读到：8
map ing ---[3498, 6323, 7367, 8764, 483275]
map读到：33
map ing ---[3498, 6323, 7367, 8764, 483275]
map读到：5
map ing ---[3498, 6323, 7367, 8764, 483275]
map读到：8
map ing ---[3498, 6323, 7367, 8764, 483275]
map读到：6
map ing ---[3498, 6323, 7367, 8764, 483275]
map读到：45
map ing ---[3498, 6323, 7367, 8764, 483275]
Mapper-cleanup处理。。。

#  第二次 reduce，第二个txt文件map后的结果
MyReduce instance ...
MyReduce-setup...
reduce读到：3498
reduce ing ---[0, 0, 0, 0, 3498]
reduce读到：6323
reduce ing ---[0, 0, 0, 3498, 6323]
reduce读到：7367
reduce ing ---[0, 0, 3498, 6323, 7367]
reduce读到：8764
reduce ing ---[0, 3498, 6323, 7367, 8764]
reduce读到：483275
reduce ing ---[3498, 6323, 7367, 8764, 483275]
MyReduce-cleanup...

#  第三次 reduce，两次map运算txt文件的结果
MyReduce instance ...
MyReduce-setup...
reduce读到：3498
reduce ing ---[0, 0, 0, 0, 3498]
reduce读到：6323
reduce ing ---[0, 0, 0, 3498, 6323]
reduce读到：7367
reduce ing ---[0, 0, 3498, 6323, 7367]
reduce读到：8764
reduce ing ---[0, 3498, 6323, 7367, 8764]
reduce读到：483275
reduce ing ---[3498, 6323, 7367, 8764, 483275]
reduce读到：632300
reduce ing ---[6323, 7367, 8764, 483275, 632300]
MyReduce-cleanup...

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

# 第一次 map，第一个txt文件

MyMapper实例化......

Mapper-setup执行。。。

map读到：8764

map ing ---[0, 0, 0, 0, 8764]

map读到：7367

map ing ---[0, 0, 0, 7367, 8764]

map读到：3498

map ing ---[0, 0, 3498, 7367, 8764]

map读到：483275

map ing ---[0, 3498, 7367, 8764, 483275]

map读到：632300