Pig script to group URL requests in JBoss

As we move towards an enterprise data analytics platform, I take every opportunity I can to come up with simple jobs in Hadoop, Hive, and Pig.

Below is one I ran in Pig that groups requests by URL, ignoring the query string, and lists the 50 most-requested URLs.

Script…

[root@expressdb1 pig-0.11.1]# cat urls.pig
-- Count access-log requests per URL (query string stripped) and print the
-- 50 most-requested URLs.

-- Decode comes from piggybank, so the jar must be registered before use.
register './contrib/piggybank/java/piggybank.jar';
define DECODE org.apache.pig.piggybank.evaluation.decode.Decode();

-- The access log is space-delimited; only url is given an explicit type
-- because it is the only field passed to the string functions below.
p = load '/user/hive/warehouse/requests/localhost_access_log.2013-04-22.log.1' using PigStorage(' ') as (ip,username,time,tz,method,url:chararray,proto,status,size,ms);

-- Strip the query string: when INDEXOF finds no '?' (returns -1) keep the
-- URL unchanged, otherwise keep only the part before the first '?'.
d = foreach p generate DECODE(INDEXOF(url,'?'),-1,url,SUBSTRING(url,0,INDEXOF(url,'?'))) as url;

-- Count the requests for each stripped URL.
g = group d by url;
cnt = foreach g generate group, COUNT(d) as c;

-- Highest count first; ties are ordered by URL.
b = order cnt by c desc, group;
f = limit b 50;
dump f;
[root@expressdb1 pig-0.11.1]#

…and output of run…

[root@expressdb1 pig-0.11.1]# bin/pig -4 nolog.conf -f urls.pig
(/checkout/gadgets/minicartcontents.jsp,159267)
(/includes/header_tools.jsp,159221)
(/static/js/s_code_exp.jsp,146266)
(/catalog/gadgets/recently_viewed_items.jsp,102142)
(/static/js/refinements.js,79580)
(/catalog/gadgets/productList_filter.jsp,78103)
(/akamai/akamai-sureroute-test-object.htm,62116)
(/catalog/gadgets/color_size_gadget.jsp,60576)
(/mobile/includes/mobile_header_tools.jsp,44725)
(/mobile/static/js/s_code_exp.jsp,44064)
(/user/login.jsp,41088)
(/,37225)
(/static/js/zoomer.js,35368)
(/catalog/gadgets/zoomerDroplet.jsp,27592)
(/mobile/catalog/gadgets/categoryProductList.jsp,20261)
(/mobile/catalog/gadgets/product_details_color_size_gadget.jsp,18528)
(/favicon.ico,14687)
(/exp-mobile-favicon.png,12458)
(/checkout/basket.jsp,8567)
(/catalog/gadgets/express_view.jsp,8130)
(/common/hp_subscribe.jsp,8117)
(/static/js/expressView.js,7607)
(/mobile/,6598)
(/mobile/content.jsp,6409)
(/catalog/actions/cart-submit.jsp,5640)
(/includes/shoppingCartItemCount.jsp,5503)
(/catalog/urls/cart-submit-success.jsp,5179)
(/catalog/product_detail.jsp,4406)
(/search/search.jsp,4390)
(/content.jsp,4228)
(/mobile/images/linked-arrow.png,3507)
(/mobile/bestselling_background.jpg,3008)
(/checkout/checkout.jsp,2676)
(/mobile/linked-arrow.png,2262)
(/catalog/search_results.jsp,2174)
(/catalog/gadgets/fs_color_size_gadget.jsp,1969)
(/mobile/catalog/search_results.jsp,1925)
(/mobile/exp-mobile-favicon.png,1897)
(/mobile/includes/shoppingCartItemCount.jsp,1796)
(/user/overview.jsp,1719)
(/mobile/static/img/backgrounds/listArrow.png,1660)
(/catalog/search.cmd,1600)
(/mobile/favicon.ico,1585)
(/mobile/checkout/basket.jsp,1529)
(/mobile/catalog/category_listing.jsp,1442)
(/checkout/gadgets/removeItem.jsp,1429)
(/health.jsp,1416)
(/clothing/Women/sec/womenCategory,1406)
(/catalog/category_listing.jsp,1309)
(/checkout/,1306)
[root@expressdb1 pig-0.11.1]#

Leave a Reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.