We recently wanted to have a rough idea of how much data our Hadoop clusters process on a daily basis. Here’s the Dumbo program I used to obtain this information:
from datetime import date
class Mapper:
    """Dumbo mapper emitting the gigabytes read by each finished job.

    For every input line that starts with ``Job`` and contains ``COUNTERS``,
    yields one ``((year, month, day), gigabytes)`` pair.  The date is the
    local-time date of the line's ``FINISH_TIME`` value (divided by 1000
    before being interpreted as a Unix timestamp), and ``gigabytes`` is the
    sum of the "HDFS bytes read" and "Local bytes read" counters.
    """

    def __init__(self):
        # Imported locally, as in the original, so the mapper is
        # self-contained; using the module avoids shadowing builtin compile().
        import re

        # One pattern per counter that contributes to the processed volume.
        self.numbers = [
            re.compile("HDFS bytes read:([0-9]+)"),
            re.compile("Local bytes read:([0-9]+)"),
        ]
        self.finish = re.compile('FINISH_TIME="([^"]+)"')

    def __call__(self, key, value):
        """Map one job-log line to a ``(date_tuple, gigabytes)`` pair.

        ``key`` is ignored.  Lines that are not job counter records, or that
        carry no ``FINISH_TIME`` attribute, produce no output.

        Raises ``ValueError`` if the ``FINISH_TIME`` value is not numeric.
        """
        if not (value.startswith("Job") and "COUNTERS" in value):
            return

        finished = self.finish.search(value)
        if finished is None:
            # Without a finish time the volume cannot be assigned to a day;
            # skip the record instead of failing on a None match object.
            return

        gb = 0  # gigabytes processed
        for number in self.numbers:
            mo = number.search(value)
            if mo:
                # NOTE: each counter is rounded to whole gigabytes on its
                # own, so a counter below half a gigabyte contributes 0.
                gb += int(round(float(mo.group(1)) / 2**30))

        ts = float(finished.group(1)) / 1000
        yield date.fromtimestamp(ts).timetuple()[:3], gb
if __name__ == "__main__":
    # Script entry point: hand the mapper to Dumbo, with dumbo's sumreducer
    # serving as both the reducer and the combiner.
    import dumbo

    dumbo.run(Mapper, dumbo.sumreducer, combiner=dumbo.sumreducer)
Running this on the job logs for one of our clusters (which are gathered by the shell script discussed in this previous post) led to the following graph:
This graph clearly shows why some of us sometimes get annoyed when we want to explore data on this cluster on certain days of the week or month…
