當前位置:成語大全網 - 書法字典 - 如何用Python為Hadoop寫壹個簡單的MapReduce程序?

如何用Python為Hadoop寫壹個簡單的MapReduce程序?

在他的博客裏,Michael G. Noll 提到了如何在 Hadoop 中用 Python 寫 MapReduce 程序,韓國的 gogamza 也在他的 Blog 中提到了如何用 C 寫 MapReduce 程序(我稍微修改了壹下原程序,因為他的 Map 使用了 tab 鍵進行分詞)。我合並了他們的文章,讓國內的 Hadoop 用戶可以用其他語言編寫 MapReduce 程序。首先,妳得裝備好妳的 Hadoop 集群,網上有很多這方面的介紹,下面是鏈接(Hadoop 學習筆記二:安裝部署)。Hadoop Streaming 幫助我們在非 Java 編程語言中使用 MapReduce:Streaming 使用 STDIN(標準輸入)和 STDOUT(標準輸出)與我們編寫的 Map 和 Reduce 程序交換數據。任何能使用 stdin 和 stdout 的語言都可以用來編寫 MapReduce 程序,比如 Python 中的 sys.stdin 和 sys.stdout,或者 C 中的 stdin 和 stdout。我們還是用 Hadoop WordCount 的例子來演示如何編寫 MapReduce。在 WordCount 的例子中,我們要解決的問題是統計壹批文檔中每個單詞的出現頻率。首先我們會在 Map 程序中接收到這批文檔每壹行的數據,然後我們寫的 Map 程序會把這壹行按空格切割成壹個數組,並且對該數組遍歷後按 "<word>\t1" 的形式用標準輸出打印出來,表示該單詞出現了壹次;之後再在 Reduce 中統計詞頻。

Python 代碼如下。

Map:mapper.py

```python
#!/usr/bin/env python
import sys

# input comes from STDIN (standard input)
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()
    # split the line into words while removing any empty strings
    words = filter(lambda word: word, line.split())
    # increase counters
    for word in words:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # tab-delimited; the trivial word count is 1
        print '%s\t%s' % (word, 1)
```

Reduce:reducer.py

```python
#!/usr/bin/env python
from operator import itemgetter
import sys

# maps words to their counts
word2count = {}

# input comes from STDIN
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()
    # parse the input we got from mapper.py
    word, count = line.split()
    # convert count (currently a string) to int
    try:
        count = int(count)
        word2count[word] = word2count.get(word, 0) + count
    except ValueError:
        # count was not a number, so silently ignore this line
        pass

# sort the words lexicographically;
# this step is NOT required, we just do it so that our
# final output will look more like the official Hadoop
# word count examples
sorted_word2count = sorted(word2count.items(), key=itemgetter(0))

# write the results to STDOUT (standard output)
for word, count in sorted_word2count:
    print '%s\t%s' % (word, count)
```

C 代碼如下。

Map:mapper.c
```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUF_SIZE 2048
#define DELIM " "

int main(int argc, char *argv[])
{
    char buffer[BUF_SIZE];

    while (fgets(buffer, BUF_SIZE - 1, stdin)) {
        int len = strlen(buffer);
        if (buffer[len - 1] == '\n')
            buffer[len - 1] = 0;

        char *query = strtok(buffer, DELIM);
        while (query) {
            printf("%s\t1\n", query);
            query = strtok(NULL, DELIM);
        }
    }
    return 0;
}
```

Reduce:reducer.c

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUFFER_SIZE 1024
#define DELIM "\t"

int main(int argc, char *argv[])
{
    char strLastKey[BUFFER_SIZE];
    char strLine[BUFFER_SIZE];
    int count = 0;

    *strLastKey = '\0';
    *strLine = '\0';

    while (fgets(strLine, BUFFER_SIZE - 1, stdin)) {
        char *strCurrKey = NULL;
        char *strCurrNum = NULL;

        strCurrKey = strtok(strLine, DELIM);
        strCurrNum = strtok(NULL, DELIM); /* necessary checks omitted */

        if (strLastKey[0] == '\0') {
            strcpy(strLastKey, strCurrKey);
        }
        if (strcmp(strCurrKey, strLastKey)) {
            printf("%s\t%d\n", strLastKey, count);
            count = atoi(strCurrNum);
        } else {
            count += atoi(strCurrNum);
        }
        strcpy(strLastKey, strCurrKey);
    }
    printf("%s\t%d\n", strLastKey, count); /* flush the count */
    return 0;
}
```

首先我們在本地調試壹下源代碼:

```
chmod +x mapper.py
chmod +x reducer.py
echo "foo foo quux labs foo bar quux" | ./mapper.py | ./reducer.py
bar 1
foo 3
labs 1
quux 2

g++ mapper.c -o Mapper
g++ reducer.c -o Reducer
```
```
chmod +x Mapper
chmod +x Reducer
echo "foo foo quux labs foo bar quux" | ./Mapper | ./Reducer
bar 1
foo 2
labs 1
quux 1
quux 1
```

妳可能看到 C 的輸出和 Python 不壹樣,因為 Python 把結果放在字典裏做了匯總和排序;當我們在 Hadoop 中運行時,Hadoop 會在 Reduce 之前對 Map 的輸出進行排序,相同的單詞會在標準輸入中連續出現,所以 C 版本依賴這個排序。要在 Hadoop 中運行程序,我們首先需要用 wget 下載測試文檔,再把程序和文檔上傳到 HDFS 中運行。

下面是用 PHP 編寫的 MapReduce 程序,供 PHP 程序員參考。

Map:mapper.php

```php
#!/usr/bin/php
<?php
$word2count = array();

// input comes from STDIN (standard input)
while (($line = fgets(STDIN)) !== false) {
    // remove leading and trailing whitespace and lowercase
    $line = strtolower(trim($line));
    // split the line into words while removing any empty string
    $words = preg_split('/\W/', $line, 0, PREG_SPLIT_NO_EMPTY);
    // increase counters
    foreach ($words as $word) {
        $word2count[$word] += 1;
    }
}

// write the results to STDOUT (standard output)
// what we output here will be the input for the
// Reduce step, i.e. the input for reducer.php
foreach ($word2count as $word => $count) {
    // tab-delimited
    echo $word, chr(9), $count, PHP_EOL;
}
?>
```

Reduce:reducer.php

```php
#!/usr/bin/php
<?php
$word2count = array();

// input comes from STDIN
while (($line = fgets(STDIN)) !== false) {
    // remove leading and trailing whitespace
    $line = trim($line);
    // parse the input we got from mapper.php
    list($word, $count) = explode(chr(9), $line);
    // convert count (currently a string) to int
    $count = intval($count);
    // sum counts
    if ($count > 0) $word2count[$word] += $count;
}

// sort the words lexicographically;
// this step is NOT required, we just do it so that our
// final output will look more like the official Hadoop
// word count examples
ksort($word2count);

// write the results to STDOUT (standard output)
foreach ($word2count as $word => $count) {
    echo $word, chr(9), $count, PHP_EOL;
}
?>
```

作者:馬士華 發表於:2008-03-05。