As we move towards an enterprise data analytics platform, I take every opportunity I can to come up with simple jobs in Hadoop, Hive, and Pig.
Below is one I ran in Pig that counts requests per URL, ignoring the query string, and lists the 50 most-requested URLs.
[root@expressdb1 pig-0.11.1]# cat urls.pig
-- urls.pig: count requests per URL (query string ignored) and print the
-- 50 most-requested URLs.

-- Piggybank provides the DECODE UDF used to strip the query string.
register './contrib/piggybank/java/piggybank.jar';
define DECODE org.apache.pig.piggybank.evaluation.decode.Decode();

-- Split each access-log line on single spaces; only url is given a type.
requests = load '/user/hive/warehouse/requests/localhost_access_log.2013-04-22.log.1' using PigStorage(' ') as (ip,username,time,tz,method,url:chararray,proto,status,size,ms);

-- Keep everything before the first '?'. When INDEXOF reports -1 (no '?'),
-- DECODE returns url unchanged; otherwise it returns the SUBSTRING result.
paths = foreach requests generate DECODE(INDEXOF(url,'?'),-1,url,SUBSTRING(url,0,INDEXOF(url,'?'))) as url;

-- One row per distinct URL with its request count. The group key is named
-- explicitly so later statements do not have to refer to the keyword `group`.
by_path = group paths by url;
counts = foreach by_path generate group as url, COUNT(paths) as c;

-- Highest count first; ties are ordered by URL so the output is stable.
ranked = order counts by c desc, url;
top50 = limit ranked 50;
dump top50;
[root@expressdb1 pig-0.11.1]#
…and output of run…
[root@expressdb1 pig-0.11.1]# bin/pig -4 nolog.conf -f urls.pig
[root@expressdb1 pig-0.11.1]#