20.03 ~ 20.08 Government-funded Training / Data (Hadoop, R)

[Hadoop] Wordcount

찹키리 2020. 6. 25. 13:04

<Mapper>

package wikibooks.hadoop.chapter04;
 
import java.io.IOException;
import java.util.StringTokenizer;
 
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
 
public class WordCountMapper extends
        Mapper<LongWritable, Text, Text, IntWritable> {
    // Generic type parameters: two for the input (key, value), two for the output (key, value)
 
    private final static IntWritable one = new IntWritable(1);
    // The constant 1, emitted as the count for every word
    private Text word = new Text();
 
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // key: byte offset of the line, value: the line itself, context: object used to emit output
        StringTokenizer itr = new StringTokenizer(value.toString());
        // Split the line into word tokens and iterate over them
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            // Store the current token in word
            context.write(word, one);
            // Emit the pair (word, 1)
        }
    }
}

<Reducer>

package wikibooks.hadoop.chapter04;
 
import java.io.IOException;
 
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
 
public class WordCountReducer extends
        Reducer<Text, IntWritable, Text, IntWritable> {
    // Receives the (Text, IntWritable) pairs emitted by the Mapper
    // Generic type parameters: two for the input (key, value), two for the output (key, value)
    private IntWritable result = new IntWritable();
    // Writable integer that will hold the final count for each word
 
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // values: all counts for this key, delivered as an iterable
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
            // Extract the int from the writable object and add it to the accumulator
        }
        result.set(sum);
        // Store the accumulated total
        context.write(key, result);
        // Emit the pair (word, total count)
    }
}

<Driver Class>

package wikibooks.hadoop.chapter04;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
 
public class WordCount {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Object that carries the Hadoop configuration
        if (args.length != 2) {
            // Require exactly two arguments: the input path and the output path
            System.err.println("Usage: WordCount <input> <output>");
            System.exit(2);
        }
        Job job = new Job(conf, "WordCount");
        // Create the job from the configuration and give it a name
 
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // Register the driver, mapper, and reducer classes
 
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // Read the input and write the output as plain text
 
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Types of the output key and value
 
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // Tell the job where the input data lives
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Tell the job where to write the results
 
        job.waitForCompletion(true);
        // Submit the job and block until it finishes
    }
}

<Running Hadoop>

Put the contents of the input.txt file prepared earlier into input.txt under the bin directory.

Then feed the prepared input.txt to the WordCount job, which writes its results into the wordcount_output folder (a sketch of the commands follows below).
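The screenshots of this step are not reproduced here, so below is a rough sketch of the shell commands they would correspond to. The working directory (the Hadoop installation root) and the jar name wordcount.jar are assumptions; substitute whatever name the three classes above were packaged under.

# Upload the local input.txt into HDFS (run from the Hadoop installation directory; paths are assumptions)
./bin/hadoop fs -put input.txt input.txt
 
# Run the job: the first argument is the input path, the second the output folder
./bin/hadoop jar wordcount.jar wikibooks.hadoop.chapter04.WordCount input.txt wordcount_output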

Check that the output was created.
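A minimal sketch of that check, assuming the output folder name used above:

# List the job output folder; a successful run contains a _SUCCESS marker and part-r-* files
./bin/hadoop fs -ls wordcount_output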

Read the result data with the cat command.
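A sketch of that read, assuming a single reducer (the part file name can differ):

# Print the reducer output; each line is a word followed by its count
./bin/hadoop fs -cat wordcount_output/part-r-00000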