<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
	>

<channel>
	<title>Dumbotics</title>
	<atom:link href="http://dumbotics.com/feed/" rel="self" type="application/rss+xml" />
	<link>http://dumbotics.com</link>
	<description>Pseudo-random ramblings about Dumbo and Hadoop</description>
	<lastBuildDate>Sun, 15 Jan 2012 14:28:45 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.com/</generator>
<cloud domain='dumbotics.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
<image>
		<url>http://s2.wp.com/i/buttonw-com.png</url>
		<title>Dumbotics</title>
		<link>http://dumbotics.com</link>
	</image>
	<atom:link rel="search" type="application/opensearchdescription+xml" href="http://dumbotics.com/osd.xml" title="Dumbotics" />
	<atom:link rel='hub' href='http://dumbotics.com/?pushpress=hub'/>
		<item>
		<title>Outputting Tokyo Cabinet or Constant DB files</title>
		<link>http://dumbotics.com/2011/04/29/tokyo-cabinet-or-constant-db/</link>
		<comments>http://dumbotics.com/2011/04/29/tokyo-cabinet-or-constant-db/#comments</comments>
		<pubDate>Fri, 29 Apr 2011 16:32:41 +0000</pubDate>
		<dc:creator>Klaas</dc:creator>
				<category><![CDATA[Announcements]]></category>
		<category><![CDATA[constant db]]></category>
		<category><![CDATA[contributions]]></category>
		<category><![CDATA[feathers]]></category>
		<category><![CDATA[tokyo cabinet]]></category>

		<guid isPermaLink="false">http://dumbotics.com/?p=1386</guid>
		<description><![CDATA[Dumbo 0.21.30 got released this week. Apart from several bugfixes, it includes some cool new functionality that allows you to output Tokyo Cabinet or Constant DB files directly by using a special reducer in combination with the nifty output formats that got added to Feathers a while ago. Many thanks to Daniel Graña and Bruno [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1386&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><a href="https://github.com/klbostee/dumbo/archives/release-0.21.30">Dumbo 0.21.30</a> got released this week. Apart from <a href="https://github.com/klbostee/dumbo/issues?labels=bug&amp;sort=created&amp;direction=desc&amp;state=closed&amp;page=1&amp;milestone=1">several bugfixes</a>, it includes some <a href="https://github.com/klbostee/dumbo/issues/32">cool new functionality</a> that allows you to output Tokyo Cabinet or Constant DB files directly by using a special reducer in combination with the <a href="https://github.com/klbostee/feathers/pull/3">nifty output</a> <a href="https://github.com/klbostee/feathers/pull/4">formats</a> that got added to <a href="http://github.com/klbostee/feathers">Feathers</a> a while ago. Many thanks to <a href="https://github.com/dangra">Daniel Graña</a> and <a href="https://github.com/brunovianarezende">Bruno Rezende</a> for contributing these awesome new features!</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/dumbotics.wordpress.com/1386/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/dumbotics.wordpress.com/1386/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/dumbotics.wordpress.com/1386/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/dumbotics.wordpress.com/1386/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/dumbotics.wordpress.com/1386/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/dumbotics.wordpress.com/1386/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/dumbotics.wordpress.com/1386/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/dumbotics.wordpress.com/1386/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/dumbotics.wordpress.com/1386/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/dumbotics.wordpress.com/1386/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/dumbotics.wordpress.com/1386/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/dumbotics.wordpress.com/1386/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/dumbotics.wordpress.com/1386/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/dumbotics.wordpress.com/1386/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1386&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://dumbotics.com/2011/04/29/tokyo-cabinet-or-constant-db/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Klaas</media:title>
		</media:content>
	</item>
		<item>
		<title>DAG jobs and mapredtest</title>
		<link>http://dumbotics.com/2010/12/17/dag-jobs-and-mapredtest/</link>
		<comments>http://dumbotics.com/2010/12/17/dag-jobs-and-mapredtest/#comments</comments>
		<pubDate>Fri, 17 Dec 2010 10:44:11 +0000</pubDate>
		<dc:creator>Klaas</dc:creator>
				<category><![CDATA[Announcements]]></category>
		<category><![CDATA[contributions]]></category>
		<category><![CDATA[DAG]]></category>
		<category><![CDATA[mapredtest]]></category>

		<guid isPermaLink="false">http://dumbotics.com/?p=1368</guid>
		<description><![CDATA[Dumbo 0.21.29 went out the other day and it includes two exciting new features you might be interested in: Support for jobs that are DAGs instead of just chains, by David Chiang. A neat unit testing module inspired by Cloudera&#8217;s MRUnit, by Adam Ever-Hadani. It&#8217;s always great to get such high-quality contributions. Please keep them [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1368&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><a href="https://github.com/klbostee/dumbo/archives/release-0.21.29">Dumbo 0.21.29</a> went out the other day and it includes two exciting new features you might be interested in:</p>
<ul>
<li>Support for <a href="https://github.com/klbostee/dumbo/issues/closed#issue/18">jobs that are DAGs</a> instead of just chains, by <a href="https://github.com/hydropyrum">David Chiang</a>.</li>
<li>A <a href="https://github.com/klbostee/dumbo/issues/closed#issue/21">neat unit testing module</a> inspired by Cloudera&#8217;s <a href="http://www.cloudera.com/blog/2009/07/debugging-mapreduce-programs-with-mrunit/">MRUnit</a>, by <a href="https://github.com/adamhadani">Adam Ever-Hadani</a>.</li>
</ul>
<p>It&#8217;s always great to get such high-quality contributions. Please keep them coming &#8211; I promise I&#8217;ll do everything I can to get them into my master branch, and eventually in a release, as quickly as possible.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/dumbotics.wordpress.com/1368/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/dumbotics.wordpress.com/1368/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/dumbotics.wordpress.com/1368/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/dumbotics.wordpress.com/1368/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/dumbotics.wordpress.com/1368/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/dumbotics.wordpress.com/1368/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/dumbotics.wordpress.com/1368/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/dumbotics.wordpress.com/1368/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/dumbotics.wordpress.com/1368/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/dumbotics.wordpress.com/1368/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/dumbotics.wordpress.com/1368/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/dumbotics.wordpress.com/1368/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/dumbotics.wordpress.com/1368/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/dumbotics.wordpress.com/1368/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1368&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://dumbotics.com/2010/12/17/dag-jobs-and-mapredtest/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Klaas</media:title>
		</media:content>
	</item>
		<item>
		<title>Dumbo backends</title>
		<link>http://dumbotics.com/2010/08/12/dumbo-backends/</link>
		<comments>http://dumbotics.com/2010/08/12/dumbo-backends/#comments</comments>
		<pubDate>Thu, 12 Aug 2010 13:21:51 +0000</pubDate>
		<dc:creator>Klaas</dc:creator>
				<category><![CDATA[Explanations]]></category>
		<category><![CDATA[avro]]></category>
		<category><![CDATA[backends]]></category>
		<category><![CDATA[tether]]></category>

		<guid isPermaLink="false">http://dumbotics.com/?p=1354</guid>
		<description><![CDATA[I released Dumbo 0.21.26 the other day. As usual we fixed various bugs, but this release also incorporates an enhancement that makes it a bit more special, namely, some refactoring that can be regarded as a first but important step towards pluggable backends. Dumbo currently has two different backends, one that runs locally on UNIX and [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1354&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>I released <a href="http://github.com/klbostee/dumbo/downloads">Dumbo 0.21.26</a> the other day. As usual we <a href="http://github.com/klbostee/dumbo/issues/closed">fixed various bugs</a>, but this release also incorporates an enhancement that makes it a bit more special, namely, <a href="http://github.com/klbostee/dumbo/issues/closed#issue/8">some refactoring</a> that can be regarded as a first but important step towards pluggable backends.</p>
<p>Dumbo currently has two different backends, one that runs locally on UNIX and another that runs on <a href="http://hadoop.apache.org/common/docs/r0.20.2/streaming.html">Hadoop Streaming</a>. The code for both of these backends used to be interwoven with the core Dumbo logic, but now we abstracted it away behind a proper backend interface which will hopefully make it easier to add more backends in the future.</p>
<p>Personally, I would very much like Dumbo to get a backend for <a href="http://svn.apache.org/viewvc/avro/trunk/lang/java/src/java/org/apache/avro/mapred/tether/">Avro Tether</a> at some point. The two main starting points for making this happen would probably be <a href="http://github.com/klbostee/dumbo/commit/535ae797ba53b86dad4bffa51d1838e9a1c04018">my main refactoring commit</a> and the <a href="http://svn.apache.org/viewvc/avro/trunk/lang/java/src/test/java/org/apache/avro/mapred/tether">Java implementation of a Tether client in the Avro unit tests</a>.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/dumbotics.wordpress.com/1354/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/dumbotics.wordpress.com/1354/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/dumbotics.wordpress.com/1354/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/dumbotics.wordpress.com/1354/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/dumbotics.wordpress.com/1354/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/dumbotics.wordpress.com/1354/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/dumbotics.wordpress.com/1354/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/dumbotics.wordpress.com/1354/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/dumbotics.wordpress.com/1354/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/dumbotics.wordpress.com/1354/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/dumbotics.wordpress.com/1354/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/dumbotics.wordpress.com/1354/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/dumbotics.wordpress.com/1354/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/dumbotics.wordpress.com/1354/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1354&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://dumbotics.com/2010/08/12/dumbo-backends/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Klaas</media:title>
		</media:content>
	</item>
		<item>
		<title>HUGUK #4</title>
		<link>http://dumbotics.com/2010/05/18/huguk-4/</link>
		<comments>http://dumbotics.com/2010/05/18/huguk-4/#comments</comments>
		<pubDate>Tue, 18 May 2010 11:27:05 +0000</pubDate>
		<dc:creator>Klaas</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://dumbotics.com/?p=1328</guid>
		<description><![CDATA[In response to Johan&#8217;s desperate request I&#8217;ve decided to organize a 4th HUGUK meetup. More info will follow on the official HUGUK blog soon, but since it&#8217;s going to be fairly short notice I thought it made sense to already share some details now: Date: Thursday 3rd of June Time: 18.30 Place: new Skills Matter [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1328&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>In response to <a href="http://blog.oskarsson.nu">Johan</a>&#8217;s <a href="http://huguk.org/2010/05/future-huguk-meetups.html">desperate request</a> I&#8217;ve decided to organize a 4th HUGUK meetup. More info will follow on the <a href="http://huguk.org">official HUGUK blog</a> soon, but since it&#8217;s going to be fairly short notice I thought it made sense to already share some details now: </p>
<ul>
<li><u>Date:</u> Thursday 3rd of June</li>
<li><u>Time:</u> 18.30</li>
<li><u>Place:</u> <a href="http://skillsmatter.com/go/find-us">new Skills Matter building</a></li>
</ul>
<p>The two main talks will be:</p>
<p><strong>“Introduction to Sqoop” by Aaron Kimball</strong></p>
<p><i>&#8211; Synopsis &#8211;</i></p>
<blockquote><p>
This talk introduces Sqoop, the open source SQL-to-Hadoop tool. Sqoop helps users perform efficient imports of data from RDBMS sources to Hadoop&#8217;s distributed file system, where it can be processed in concert with other data sources. Sqoop also allows users to export Hadoop-generated results back to an RDBMS for use with other data pipelines.</p>
<p>After this session, users will understand how databases and Hadoop fit together, and how to use Sqoop to move data between these systems. The talk will provide suggestions for best practices when integrating Sqoop and Hadoop in your data processing pipelines. We&#8217;ll also cover some deeper technical details of Sqoop&#8217;s architecture, and take a look at some upcoming aspects of Sqoop&#8217;s development roadmap.
</p></blockquote>
<p><i>&#8211; Bio &#8211;</i></p>
<blockquote><p>
Aaron Kimball has been working with Hadoop since early 2007. Aaron has worked with the NSF and several other universities nationally and internationally to advance education in the field of large-scale data-intensive computing. He helped create and deliver academic course materials first used at the University of Washington (and later adopted by many other academic institutions) as well as Hadoop training materials used by several industry partners. Aaron has also worked as an independent consultant focusing on Hadoop and Amazon EC2-based systems. At Cloudera, he continues to actively develop Hadoop and related tools, as well as focus on training and user education. Aaron holds a B.S. in Computer Science from Cornell University, and an M.S. in Computer Science and Engineering from the University of Washington.
</p></blockquote>
<p><strong>&#8220;Hive at Last.fm&#8221; by Tim Sell</strong></p>
<p><i>&#8211; Synopsis &#8211;</i></p>
<blockquote><p>
This talk is about using Hive in practice. We will go through some of the specific use cases for which Hive is currently being used at Last.fm, highlighting its strengths and weaknesses along the way.
</p></blockquote>
<p><i>&#8211; Bio &#8211;</i></p>
<blockquote><p>
Tim Sell is a Data Engineer at Last.fm who works with Hive and Hadoop on a daily basis.
</p></blockquote>
<p>As usual we&#8217;ll try to provide some free beer at the end and anyone is welcome to give a short lightning talk after the main presentations.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/dumbotics.wordpress.com/1328/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/dumbotics.wordpress.com/1328/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/dumbotics.wordpress.com/1328/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/dumbotics.wordpress.com/1328/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/dumbotics.wordpress.com/1328/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/dumbotics.wordpress.com/1328/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/dumbotics.wordpress.com/1328/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/dumbotics.wordpress.com/1328/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/dumbotics.wordpress.com/1328/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/dumbotics.wordpress.com/1328/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/dumbotics.wordpress.com/1328/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/dumbotics.wordpress.com/1328/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/dumbotics.wordpress.com/1328/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/dumbotics.wordpress.com/1328/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1328&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://dumbotics.com/2010/05/18/huguk-4/feed/</wfw:commentRss>
		<slash:comments>3</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Klaas</media:title>
		</media:content>
	</item>
		<item>
		<title>Dumbo at PyCon</title>
		<link>http://dumbotics.com/2010/02/22/dumbo-at-pycon/</link>
		<comments>http://dumbotics.com/2010/02/22/dumbo-at-pycon/#comments</comments>
		<pubDate>Mon, 22 Feb 2010 10:16:12 +0000</pubDate>
		<dc:creator>Klaas</dc:creator>
				<category><![CDATA[Examples]]></category>
		<category><![CDATA[amazon]]></category>
		<category><![CDATA[ec2]]></category>
		<category><![CDATA[nltk]]></category>
		<category><![CDATA[pycon]]></category>

		<guid isPermaLink="false">http://dumbotics.com/?p=1315</guid>
		<description><![CDATA[Nitin Madnani gave a talk at PyCon this weekend about how Dumbo and Amazon EC2 allowed him to process very large text corpora using the machinery provided by NLTK. Unfortunately I wasn&#8217;t there but I heard that his talk was very well received, and his slides definitely are pretty awesome.<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1315&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Nitin Madnani gave a <a href="http://us.pycon.org/2010/conference/schedule/event/98/">talk</a> at <a href="http://us.pycon.org/2010">PyCon</a> this weekend about how <a href="http://last.fm/dumbo">Dumbo</a> and <a href="http://aws.amazon.com/ec2/">Amazon EC2</a> allowed him to process very large text corpora using the machinery provided by <a href="http://www.nltk.org/">NLTK</a>. Unfortunately I wasn&#8217;t there but I heard that his talk was very well received, and <a href="http://us.pycon.org/media/2010/talkdata/PyCon2010/098/large-scale-nlp-pycon-2010.pdf">his slides</a> definitely are pretty awesome.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/dumbotics.wordpress.com/1315/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/dumbotics.wordpress.com/1315/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/dumbotics.wordpress.com/1315/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/dumbotics.wordpress.com/1315/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/dumbotics.wordpress.com/1315/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/dumbotics.wordpress.com/1315/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/dumbotics.wordpress.com/1315/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/dumbotics.wordpress.com/1315/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/dumbotics.wordpress.com/1315/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/dumbotics.wordpress.com/1315/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/dumbotics.wordpress.com/1315/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/dumbotics.wordpress.com/1315/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/dumbotics.wordpress.com/1315/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/dumbotics.wordpress.com/1315/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1315&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://dumbotics.com/2010/02/22/dumbo-at-pycon/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Klaas</media:title>
		</media:content>
	</item>
		<item>
		<title>Consuming Dumbo output with Pig</title>
		<link>http://dumbotics.com/2010/02/05/consuming-dumbo-output-with-pig/</link>
		<comments>http://dumbotics.com/2010/02/05/consuming-dumbo-output-with-pig/#comments</comments>
		<pubDate>Fri, 05 Feb 2010 10:39:51 +0000</pubDate>
		<dc:creator>Klaas</dc:creator>
				<category><![CDATA[Examples]]></category>
		<category><![CDATA[Tips and tricks]]></category>
		<category><![CDATA[pig]]></category>
		<category><![CDATA[pigtail]]></category>
		<category><![CDATA[typed bytes]]></category>

		<guid isPermaLink="false">http://dumbotics.com/?p=1295</guid>
		<description><![CDATA[Although it abstracts and simplifies it all quite a bit, Dumbo still forces you to think in MapReduce, which might not be ideal if you want to implement complex data flows in a limited amount of time. Personally, I think that Dumbo still occupies a useful space within the Hadoop ecosystem, but in some cases [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1295&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Although it abstracts and simplifies it all quite a bit, Dumbo still forces you to think in MapReduce, which might not be ideal if you want to implement complex data flows in a limited amount of time. Personally, I think that Dumbo still occupies a useful space within the Hadoop ecosystem, but in some cases it makes sense to work at an even higher level and use something like <a href="http://hadoop.apache.org/pig">Pig</a> or <a href="http://hadoop.apache.org/hive">Hive</a>. In fact, sometimes it makes sense to combine the two and do some parts of your data flow in Dumbo and others in Pig. To make this possible, I recently wrote a Pig <a href="http://github.com/klbostee/pigtail/blob/master/src/main/java/fm/last/pigtail/storage/TypedBytesSequenceFileLoader.java">loader function for sequence files that contain <tt>TypedBytesWritable</tt>s</a>, which is the file format Dumbo uses by default to store all its output on Hadoop. Here&#8217;s an example of a Pig script that reads Dumbo output:</p>
<blockquote><pre>
register pigtail.jar;  -- http://github.com/klbostee/pigtail

a = load '/hdfs/path/to/dumbo/output'
    using fm.last.pigtail.storage.TypedBytesSequenceFileLoader()
    as (artist:int, val:(listeners:int, listens:int));
b = foreach a generate artist, val.listeners as listeners;
c = order b by listeners;
d = limit c 100;

dump d;
</pre>
</blockquote>
<p>You basically just have to specify names and types for the components of the key/value pairs and you&#8217;re good to go.</p>
<p>A possibly useful side-effect of writing this loader is the ability it creates of reading all sorts of file formats with Pig. Everything that Dumbo can read can also be consumed by Pig scripts now, all you have to do is write a simple Dumbo script that converts it to typed bytes sequence files:</p>
<blockquote><pre>
from dumbo import run
from dumbo.lib import identitymapper

if __name__ == "__main__":
    run(identitymapper)
</pre>
</blockquote>
<p>The proper solution is of course to write custom Pig loaders, but this gets the job done too and doesn&#8217;t slow things down that much.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/dumbotics.wordpress.com/1295/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/dumbotics.wordpress.com/1295/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/dumbotics.wordpress.com/1295/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/dumbotics.wordpress.com/1295/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/dumbotics.wordpress.com/1295/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/dumbotics.wordpress.com/1295/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/dumbotics.wordpress.com/1295/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/dumbotics.wordpress.com/1295/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/dumbotics.wordpress.com/1295/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/dumbotics.wordpress.com/1295/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/dumbotics.wordpress.com/1295/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/dumbotics.wordpress.com/1295/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/dumbotics.wordpress.com/1295/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/dumbotics.wordpress.com/1295/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1295&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://dumbotics.com/2010/02/05/consuming-dumbo-output-with-pig/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Klaas</media:title>
		</media:content>
	</item>
		<item>
		<title>Reading Hadoop records in Python</title>
		<link>http://dumbotics.com/2009/12/23/reading-hadoop-records-in-python/</link>
		<comments>http://dumbotics.com/2009/12/23/reading-hadoop-records-in-python/#comments</comments>
		<pubDate>Wed, 23 Dec 2009 20:32:45 +0000</pubDate>
		<dc:creator>Klaas</dc:creator>
				<category><![CDATA[Tips and tricks]]></category>
		<category><![CDATA[ctypedbytes]]></category>
		<category><![CDATA[hadoop records]]></category>
		<category><![CDATA[hadoop_record]]></category>
		<category><![CDATA[jute]]></category>
		<category><![CDATA[recordio]]></category>
		<category><![CDATA[typedbytes]]></category>

		<guid isPermaLink="false">http://dumbotics.com/?p=1272</guid>
		<description><![CDATA[At the 11/18 Bay Area HUG, Paul Tarjan apparently presented an approach for reading Hadoop records in Python. In summary, his approach seems to work as follows: Hadoop records &#160;&#160;&#160;&#160; &#8594; CsvRecordInput &#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160; &#8594; hadoop_record Python module Although it&#8217;s a nice and very systematic solution, I couldn&#8217;t resist blogging about an already existing alternative solution [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1272&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>At the <a href="http://developer.yahoo.net/blogs/hadoop/2009/11/1118_hadoop_bay_area_user_grou.html">11/18 Bay Area HUG</a>, <a href="http://blog.paulisageek.com/">Paul Tarjan</a> apparently <a href="http://www.slideshare.net/hadoopusergroup/hadoop-record-reader-in-python-2635453?src=embed">presented</a> an approach for reading <a href="http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/record/package-summary.html">Hadoop records</a> in Python. In summary, his approach seems to work as follows:</p>
<blockquote><p>
Hadoop records<br />
&nbsp;&nbsp;&nbsp;&nbsp; &rarr; <a href="http://svn.apache.org/viewvc/hadoop/common/trunk/src/java/org/apache/hadoop/record/CsvRecordInput.java?view=markup"><code>CsvRecordInput</code></a><br />
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; &rarr; <a href="http://github.com/ptarjan/hadoop_record"><code>hadoop_record</code> Python module</a>
</p></blockquote>
<p>Although it&#8217;s a nice and very systematic solution, I couldn&#8217;t resist blogging about an already existing alternative solution for this problem: </p>
<blockquote><p>
Hadoop records<br />
&nbsp;&nbsp;&nbsp;&nbsp; &rarr; <a href="http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/streaming/src/java/org/apache/hadoop/typedbytes/TypedBytesRecordInput.java?view=markup"><code>TypedBytesRecordInput</code></a><br />
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; &rarr; <a href="http://github.com/klbostee/typedbytes"><code>typedbytes</code> Python module</a>
</p></blockquote>
<p>Not only would this have saved Paul a lot of work, it probably also would&#8217;ve been more efficient, especially when using <a href="http://github.com/klbostee/ctypedbytes">ctypedbytes</a>, the speedy variant of the <a href="http://github.com/klbostee/typedbytes">typedbytes module</a>.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/dumbotics.wordpress.com/1272/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/dumbotics.wordpress.com/1272/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/dumbotics.wordpress.com/1272/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/dumbotics.wordpress.com/1272/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/dumbotics.wordpress.com/1272/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/dumbotics.wordpress.com/1272/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/dumbotics.wordpress.com/1272/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/dumbotics.wordpress.com/1272/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/dumbotics.wordpress.com/1272/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/dumbotics.wordpress.com/1272/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/dumbotics.wordpress.com/1272/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/dumbotics.wordpress.com/1272/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/dumbotics.wordpress.com/1272/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/dumbotics.wordpress.com/1272/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1272&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://dumbotics.com/2009/12/23/reading-hadoop-records-in-python/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Klaas</media:title>
		</media:content>
	</item>
		<item>
		<title>Dumbo on Amazon EMR</title>
		<link>http://dumbotics.com/2009/12/23/dumbo-on-amazon-emr/</link>
		<comments>http://dumbotics.com/2009/12/23/dumbo-on-amazon-emr/#comments</comments>
		<pubDate>Wed, 23 Dec 2009 09:24:56 +0000</pubDate>
		<dc:creator>Klaas</dc:creator>
				<category><![CDATA[Examples]]></category>
		<category><![CDATA[Tips and tricks]]></category>
		<category><![CDATA[amazon]]></category>
		<category><![CDATA[ec2]]></category>
		<category><![CDATA[elastic mapreduce]]></category>
		<category><![CDATA[mapreduce-1293]]></category>

		<guid isPermaLink="false">http://dumbotics.com/?p=1253</guid>
		<description><![CDATA[A while ago, I received an email from Andrew in which he wrote: Now you should be able to run Dumbo jobs on Elastic MapReduce. To start a cluster, you can use the Ruby client as so: $ elastic-mapreduce --create --alive SSH into the cluster using your EC2 keypair as user hadoop and install Dumbo [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1253&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>A while ago, I received an email from <a href="http://andrewhitchcock.org/">Andrew</a> in which he wrote:</p>
<blockquote><p>
Now you should be able to run Dumbo jobs on <a href="http://aws.amazon.com/elasticmapreduce/">Elastic MapReduce</a>. To start a cluster, you can use the Ruby client as so:</p>
<p><code>$ elastic-mapreduce --create --alive</code></p>
<p>SSH into the cluster using your <a href="http://aws.amazon.com/ec2/">EC2</a> keypair as user <code>hadoop</code> and install Dumbo with the following two commands:</p>
<p><code>$ wget -O ez_setup.py http://bit.ly/ezsetup</code><br />
<code>$ sudo python ez_setup.py dumbo</code></p>
<p>Then you can run your Dumbo scripts. I was able to run the <code>ipcount.py</code> demo with the following command.</p>
<p><code>$ dumbo start ipcount.py -hadoop /home/hadoop \<br />
-input s3://anhi-test-data/wordcount/input/ \<br />
-output s3://anhi-test-data/output/dumbo/wc/</code></p>
<p>The <code>-hadoop</code> option is important. At this point I haven&#8217;t created an automatic Dumbo install script, so you&#8217;ll have to install Dumbo by hand each time you launch the cluster. Fortunately installation is easy.
</p></blockquote>
<p>There was a <a href="http://groups.google.com/group/dumbo-user/msg/70c910f1250b1d63">minor hiccup</a> that required the Amazon guys to pull the AMI with Dumbo support, but it&#8217;s back now and they seem to be confident that Dumbo support is going to remain available from now on. They are also still planning to make things even easier by providing an automatic Dumbo installation script.</p>
<p>As an aside, it&#8217;s worth mentioning that a bug in Hadoop Streaming <a href="http://issues.apache.org/jira/browse/MAPREDUCE-1293">got fixed</a> in the process of adding Dumbo support to EMR. I can&#8217;t wait to see what else the Amazon guys have up their sleeves.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/dumbotics.wordpress.com/1253/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/dumbotics.wordpress.com/1253/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/dumbotics.wordpress.com/1253/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/dumbotics.wordpress.com/1253/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/dumbotics.wordpress.com/1253/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/dumbotics.wordpress.com/1253/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/dumbotics.wordpress.com/1253/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/dumbotics.wordpress.com/1253/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/dumbotics.wordpress.com/1253/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/dumbotics.wordpress.com/1253/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/dumbotics.wordpress.com/1253/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/dumbotics.wordpress.com/1253/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/dumbotics.wordpress.com/1253/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/dumbotics.wordpress.com/1253/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1253&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://dumbotics.com/2009/12/23/dumbo-on-amazon-emr/feed/</wfw:commentRss>
		<slash:comments>4</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Klaas</media:title>
		</media:content>
	</item>
		<item>
		<title>Moving to Hadoop 0.20</title>
		<link>http://dumbotics.com/2009/11/23/moving-to-hadoop-0-20/</link>
		<comments>http://dumbotics.com/2009/11/23/moving-to-hadoop-0-20/#comments</comments>
		<pubDate>Mon, 23 Nov 2009 09:26:29 +0000</pubDate>
		<dc:creator>Klaas</dc:creator>
				<category><![CDATA[Explanations]]></category>
		<category><![CDATA[Tips and tricks]]></category>
		<category><![CDATA[cloudera]]></category>
		<category><![CDATA[hadoop 0.20]]></category>
		<category><![CDATA[hadoop-gpl-compression]]></category>
		<category><![CDATA[hadoop-lzo]]></category>
		<category><![CDATA[mapreduce-764]]></category>
		<category><![CDATA[mapreduce-967]]></category>

		<guid isPermaLink="false">http://dumbotics.com/?p=1233</guid>
		<description><![CDATA[We&#8217;ve finally started looking into moving from Hadoop 0.18 to 0.20 at Last.fm, and I thought it might be useful to share a few Dumbo-related things I learned in the process: We&#8217;re probably going to base our 0.20 build on Cloudera&#8217;s 0.20 distribution, and I found out the hard way that Dumbo doesn&#8217;t work on [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1233&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>We&#8217;ve finally started looking into moving from Hadoop 0.18 to 0.20 at <a href="http://last.fm">Last.fm</a>, and I thought it might be useful to share a few Dumbo-related things I learned in the process:</p>
<ul>
<li>We&#8217;re probably going to base our 0.20 build on <a href="http://cloudera.com">Cloudera</a>&#8217;s <a href="http://archive.cloudera.com/cdh/testing/">0.20 distribution</a>, and I found out the hard way that Dumbo doesn&#8217;t work on version 0.20.1+133 of this distribution because it includes a patch for <a href="http://issues.apache.org/jira/browse/MAPREDUCE-967">MAPREDUCE-967</a> that <a href="http://issues.apache.org/jira/browse/MAPREDUCE-967?focusedCommentId=12770121&amp;page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#action_12770121">breaks</a> some of the Hadoop Streaming functionality on which Dumbo relies. Luckily, the Cloudera guys fixed it in 0.20.1+152 by reverting this patch, but if you&#8217;re still trying to get Dumbo to work on Cloudera&#8217;s 0.20.1+133 distribution for some reason then you can expect to get NullPointerExceptions and errors like, e.g., &#8220;module wordcount not found&#8221; in your tasks&#8217; stderr logs.</li>
<li>Also, the Cloudera guys apparently haven&#8217;t added the patch for <a href="http://issues.apache.org/jira/browse/MAPREDUCE-764">MAPREDUCE-764</a> to their distribution yet, so you&#8217;ll still have to apply this patch yourself if you want to avoid <a href="http://dumbotics.com/2009/07/15/mapreduce-764/">strange encoding problems</a> in certain corner cases. This patch has now been reviewed and accepted for Hadoop 0.21 for quite a while already though, so maybe we can be hopeful about it getting included in Cloudera&#8217;s 0.20 distribution soon.</li>
<li>The <a href="http://twitter.com">Twitter</a> guys put together a pretty awesome <a href="http://github.com/kevinweil/hadoop-lzo">patched and backported version</a> of <a href="http://code.google.com/p/hadoop-gpl-compression/">hadoop-gpl-compression</a> for Hadoop 0.20. It includes several bugfixes and it also provides an InputFormat for the old API, which is useful for Hadoop Streaming (and hence also Dumbo) users since Streaming has not been converted to the new API yet. If you&#8217;re interested in this stuff, you might want to have a look at <a href="http://www.cloudera.com/blog/2009/11/17/hadoop-at-twitter-part-1-splittable-lzo-compression/">this</a> guest post from <a href="http://twitter.com/kevinWeil">Kevin</a> and <a href="http://twitter.com/emaland">Eric</a> on the Cloudera blog.</li>
</ul>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/dumbotics.wordpress.com/1233/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/dumbotics.wordpress.com/1233/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/dumbotics.wordpress.com/1233/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/dumbotics.wordpress.com/1233/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/dumbotics.wordpress.com/1233/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/dumbotics.wordpress.com/1233/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/dumbotics.wordpress.com/1233/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/dumbotics.wordpress.com/1233/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/dumbotics.wordpress.com/1233/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/dumbotics.wordpress.com/1233/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/dumbotics.wordpress.com/1233/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/dumbotics.wordpress.com/1233/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/dumbotics.wordpress.com/1233/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/dumbotics.wordpress.com/1233/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1233&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://dumbotics.com/2009/11/23/moving-to-hadoop-0-20/feed/</wfw:commentRss>
		<slash:comments>7</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Klaas</media:title>
		</media:content>
	</item>
		<item>
		<title>Dumbo over HBase</title>
		<link>http://dumbotics.com/2009/07/31/dumbo-over-hbase/</link>
		<comments>http://dumbotics.com/2009/07/31/dumbo-over-hbase/#comments</comments>
		<pubDate>Fri, 31 Jul 2009 13:46:37 +0000</pubDate>
		<dc:creator>Klaas</dc:creator>
				<category><![CDATA[Examples]]></category>
		<category><![CDATA[Tips and tricks]]></category>
		<category><![CDATA[hbase]]></category>
		<category><![CDATA[inputformat]]></category>
		<category><![CDATA[outputformat]]></category>

		<guid isPermaLink="false">http://dumbotics.com/?p=1205</guid>
		<description><![CDATA[This should be old news for dumbo-user subscribers, but Tim has, once again, put his Java coding skills to good use. This time around he created nifty input and output formats for consuming and/or producing HBase tables from Dumbo programs. Here&#8217;s a silly but illustrative example: from dumbo import opt, run @opt("inputformat", "fm.last.hbase.mapred.TypedBytesTableInputFormat") @opt("hadoopconf", "hbase.mapred.tablecolumns=testfamily:testqualifier") [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1205&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>This should be <a href="http://groups.google.com/group/dumbo-user/browse_thread/thread/fb74b3be4600e85b">old news</a> for <a href="http://groups.google.com/group/dumbo-user">dumbo-user</a> subscribers, but <a href="http://nectarius.net/">Tim</a> has, <a href="http://dumbotics.com/2009/05/06/binarypartitioner-backported-to-018/">once again</a>, put his Java coding skills to <a href="http://twitter.com/roserpens/statuses/2891376424">good use</a>. This time around he created nifty <a href="http://github.com/tims/lasthbase/blob/81c3b3410f0609c7a899d462d27ce18597ccffea/src/java/fm/last/hbase/mapred/TypedBytesTableInputFormat.java">input</a> and <a href="http://github.com/tims/lasthbase/blob/81c3b3410f0609c7a899d462d27ce18597ccffea/src/java/fm/last/hbase/mapred/TypedBytesTableOutputFormat.java">output</a> formats for consuming and/or producing <a href="http://hadoop.apache.org/hbase/">HBase</a> tables from Dumbo programs. Here&#8217;s a silly but illustrative example: </p>
<blockquote><pre>
from dumbo import opt, run

@opt("inputformat", "fm.last.hbase.mapred.TypedBytesTableInputFormat")
@opt("hadoopconf", "hbase.mapred.tablecolumns=testfamily:testqualifier")
def mapper(key, columns):
    for family, column in columns.iteritems():
        for qualifier, value in column.iteritems():
            yield key, (family, qualifier, value)

@opt("outputformat", "fm.last.hbase.mapred.TypedBytesTableOutputFormat")
@opt("hadoopconf", "hbase.mapred.outputtable=output_table")
def reducer(key, values):
    """Regroup the mapper's (family, qualifier, value) triples per row key.

    Yields a single (key, columns) pair where columns is a nested dict of
    the form {family: {qualifier: value}}, i.e. the same shape the mapper
    iterates over on the input side.
    """
    columns = {}
    for family, qualifier, value in values:
        # setdefault stores the per-family dict in columns on first use;
        # a plain get(family, {}) would hand back a throwaway dict and the
        # assignment below would be lost.
        columns.setdefault(family, {})[qualifier] = value
    yield key, columns

if __name__ == "__main__":
    run(mapper, reducer)
</pre>
</blockquote>
<p>Have a look at the <a href="http://github.com/tims/lasthbase/blob/81c3b3410f0609c7a899d462d27ce18597ccffea/README.txt">readme</a> for more information.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/dumbotics.wordpress.com/1205/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/dumbotics.wordpress.com/1205/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/dumbotics.wordpress.com/1205/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/dumbotics.wordpress.com/1205/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/dumbotics.wordpress.com/1205/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/dumbotics.wordpress.com/1205/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/dumbotics.wordpress.com/1205/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/dumbotics.wordpress.com/1205/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/dumbotics.wordpress.com/1205/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/dumbotics.wordpress.com/1205/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/dumbotics.wordpress.com/1205/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/dumbotics.wordpress.com/1205/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/dumbotics.wordpress.com/1205/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/dumbotics.wordpress.com/1205/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=dumbotics.com&amp;blog=6701349&amp;post=1205&amp;subd=dumbotics&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://dumbotics.com/2009/07/31/dumbo-over-hbase/feed/</wfw:commentRss>
		<slash:comments>4</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Klaas</media:title>
		</media:content>
	</item>
	</channel>
</rss>
