无任何干货,仅供复制
程序说明:
1. 分析一个应用的访问日志文件,找出每个用户ID的访问次数。日志格式基本上是:"2012-10-26 14:41:30,748 userNameId-777 from IP-10.232.25.144 invoked URL-http://xxx/hello.jsonp"
2. Standalone模式,但直接用maven项目所依赖的hadoop库,你不必再另装hadoop
<!-- pom.xml --> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-core</artifactId> <version>1.0.4</version> </dependency>
//Mapper
/**
 * Mapper: extracts the user id from each access-log line and emits (userId, 1).
 * Lines that do not contain a "userNameId-<digits>" token are silently skipped.
 */
public class Coupon11LogMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // Compiled once per task instead of once per input record (Pattern.compile
    // inside map() was a per-record cost on the hot path).
    private static final Pattern ACCESS_PATTERN = Pattern.compile(".*userNameId\\-(\\d+).*");

    // Reused writables — the standard Hadoop idiom to avoid allocating two
    // objects per input record.
    private static final LongWritable ONE = new LongWritable(1L);
    private final Text outKey = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws java.io.IOException, InterruptedException {
        Matcher matcher = ACCESS_PATTERN.matcher(value.toString());
        if (!matcher.find()) {
            return; // no userNameId token on this line — not an access record
        }
        outKey.set(matcher.group(1));
        context.write(outKey, ONE);
    }
}
//Reducer
/**
 * Reducer: sums the per-record counts for one user id and emits
 * (userId, totalAccessCount).
 */
public class Coupon11LogReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    // Reused output writable — avoids one allocation per distinct key.
    private final LongWritable result = new LongWritable();

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // Primitive accumulator: the original boxed Long autoboxed on every
        // iteration of the reduce loop.
        long sum = 0L;
        for (LongWritable value : values) {
            sum += value.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
//Job Runner
/**
 * Job driver: wires the mapper/reducer together and runs the access-log
 * counting job in standalone mode.
 *
 * Usage: [inputPath [outputDir]] — both arguments are optional; when omitted
 * the original hard-coded local paths are used, so existing invocations keep
 * working unchanged.
 */
public class Coupon11LogJobMain {
    public static void main(String[] args) throws Exception {
        // Generalized: paths may now come from the command line; the previous
        // hard-coded locations remain the defaults for backward compatibility.
        String inputFile = args.length > 0
                ? args[0]
                : "/home/kent/dev/hadoop/bigdata/coupon11/coupon11.log";
        String outDir = args.length > 1
                ? args[1]
                : "/home/kent/dev/hadoop/bigdata/coupon11/output" + System.currentTimeMillis();

        Job job = new Job();
        job.setJarByClass(Coupon11LogJobMain.class);
        FileInputFormat.addInputPaths(job, inputFile);
        // Timestamped default output dir so reruns don't collide with an
        // existing directory (Hadoop fails if the output path already exists).
        FileOutputFormat.setOutputPath(job, new Path(outDir));
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setMapperClass(Coupon11LogMapper.class);
        // Summation is associative and commutative, so the reducer is a valid
        // combiner — it pre-aggregates map output and cuts shuffle traffic.
        job.setCombinerClass(Coupon11LogReducer.class);
        job.setReducerClass(Coupon11LogReducer.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}