如何用Python為Hadoop寫一個簡單的MapReduce程序？

Michael G. Noll 在他的博客中介紹了如何用 Python 為 Hadoop 編寫 MapReduce 程序，韓國的 gogamza 也在他的 Blog 中介紹了如何用 C 編寫 MapReduce 程序（我稍微修改了一下原程序，因為他的 Map 使用 tab 鍵進行分詞）。我合併了他們的文章，讓國內的 Hadoop 用戶可以用其他語言編寫 MapReduce 程序。

首先，你得配置好你的 Hadoop 集群。網上有很多這方面的介紹，下面是一個鏈接（Hadoop學習筆記二：安裝部署）。

Hadoop Streaming 幫助我們用非 Java 的編程語言使用 MapReduce。Streaming 通過 STDIN（標準輸入）和 STDOUT（標準輸出）與我們編寫的 Map 和 Reduce 程序交換數據。任何能夠使用 STDIN 和 STDOUT 的語言都可以用來編寫 MapReduce 程序，比如 Python 中的 sys.stdin 和 sys.stdout，或者 C 中的 stdin 和 stdout。

我們還是用 Hadoop 的 WordCount 例子來演示如何編寫 MapReduce。在 WordCount 的例子中，我們要解決的問題是計算一批文檔中每個單詞的出現頻率。首先，Map 程序會接收到這批文檔中每一行的數據，然後把這一行按空格切割成一個數組，再遍歷該數組，把每個單詞連同“1”寫到標準輸出，表示該單詞出現了一次。之後在 Reduce 中統計詞頻。

Python Code

Map: mapper.py

#!/usr/bin/env python

import sys

# maps words to their counts
word2count = {}

# input comes from STDIN (standard input)
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()
    # split the line into words while removing any empty strings
    words = filter(lambda word: word, line.split())
    # increase counters
    for word in words:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # tab-delimited; the trivial word count is 1
        print '%s\t%s' % (word, 1)

Reduce: reducer.py

#!/usr/bin/env python

from operator import itemgetter
import sys

# maps words to their counts
word2count = {}

# input comes from STDIN
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()

    # parse the input we got from mapper.py
    word, count = line.split()
    # convert count (currently a string) to int
    try:
        count = int(count)
        word2count[word] = word2count.get(word, 0) + count
    except ValueError:
        pass

# this step is NOT required, we just do it so that our
# final output will look more like the official Hadoop
# word count examples
sorted_word2count = sorted(word2count.items(), key=itemgetter(0))

# write the results to STDOUT (standard output)
for word, count in sorted_word2count:
    print '%s\t%s' % (word, count)

C Code

Map: mapper.c

/* The header names were lost in the source text; these are the ones the code below requires. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#define BUF_SIZE 2048
#define DELIM " "

int main(int argc, char *argv[])
{
    char buffer[BUF_SIZE];

    while (fgets(buffer, BUF_SIZE - 1, stdin)) {
        int len = strlen(buffer);
        if (buffer[len - 1] == '\n')
            buffer[len - 1] = 0;

        char *querys = index(buffer, ' ');
        char *query = NULL;
        if (querys == NULL)
            continue;
        querys += 1; /* not to include '\t' */

        query = strtok(buffer, " ");
        while (query) {
            printf("%s\t1\n", query);
            query = strtok(NULL, " ");
        }
    }
    return 0;
}

Reduce: reducer.c

/* The header names were lost in the source text; these are the ones the code below requires. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUFFER_SIZE 1024
#define DELIM "\t"

int main(int argc, char *argv[])
{
    char strLastKey[BUFFER_SIZE];
    char strLine[BUFFER_SIZE];
    int count = 0;

    *strLastKey = '\0';
    *strLine = '\0';

    while (fgets(strLine, BUFFER_SIZE - 1, stdin)) {
        char *strCurrKey = NULL;
        char *strCurrNum = NULL;

        strCurrKey = strtok(strLine, DELIM);
        strCurrNum = strtok(NULL, DELIM);
        /* error checking is needed here, but it is omitted */

        if (strLastKey[0] == '\0') {
            strcpy(strLastKey, strCurrKey);
        }

        if (strcmp(strCurrKey, strLastKey)) {
            printf("%s\t%d\n", strLastKey, count);
            count = atoi(strCurrNum);
        } else {
            count += atoi(strCurrNum);
        }
        strcpy(strLastKey, strCurrKey);
    }
    printf("%s\t%d\n", strLastKey, count); /* flush the count */
    return 0;
}

首先我們來調試一下源代碼：

chmod +x mapper.py
chmod +x reducer.py
echo "foo foo quux labs foo bar quux" | ./mapper.py | ./reducer.py
bar     1
foo     3
labs    1
quux    2

g++ mapper.c -o Mapper
g++ reducer.c -o Reducer
chmod +x Mapper
chmod +x Reducer
echo "foo quux labs foo bar quux" | ./Mapper | ./Reducer
bar     1
foo     2
labs    1
quux    1
quux    1

你可能注意到 C 的輸出和 Python 的不一樣，這是因為 Python 把單詞放進了字典裡。在 Hadoop 中運行時，Hadoop 會對此進行排序，這樣相同的單詞就會在標準輸出中連續出現。

要在 Hadoop 中運行程序，我們首先需要用 wget 下載測試文檔。

下面是用 PHP 編寫的 MapReduce 程序，供 PHP 程序員參考：

Map: mapper.php

#!/usr/bin/php
<?php

$word2count = array();

// input comes from STDIN (standard input)
while (($line = fgets(STDIN)) !== false) {
    // remove leading and trailing whitespace and lowercase
    $line = strtolower(trim($line));
    // split the line into words while removing any empty string
    $words = preg_split('/\W/', $line, 0, PREG_SPLIT_NO_EMPTY);
    // increase counters
    foreach ($words as $word) {
        $word2count[$word] += 1;
    }
}

// write the results to STDOUT (standard output)
// what we output here will be the input for the
// Reduce step, i.e. the input for reducer.php
foreach ($word2count as $word => $count) {
    // tab-delimited
    echo $word, chr(9), $count, PHP_EOL;
}

?>

Reduce: reducer.php

#!/usr/bin/php
<?php

$word2count = array();

// input comes from STDIN
while (($line = fgets(STDIN)) !== false) {
    // remove leading and trailing whitespace
    $line = trim($line);
    // parse the input we got from mapper.php
    list($word, $count) = explode(chr(9), $line);
    // convert count (currently a string) to int
    $count = intval($count);
    // sum counts
    if ($count > 0) $word2count[$word] += $count;
}

// sort the words lexicographically
//
// this step is NOT required, we just do it so that our
// final output will look more like the official Hadoop
// word count examples
ksort($word2count);

// write the results to STDOUT (standard output)
foreach ($word2count as $word => $count) {
    echo $word, chr(9), $count, PHP_EOL;
}

?>

作者：馬士華　發表於：2008年3月5日。