在Google这个由10的100次方得名的站点中,各种评估网站的算法层出不穷,而PageRank即是其中之一。
Google的PageRank根据网站的外部链接和内部链接的数量和质量来衡量网站的价值。PageRank背后的概念是,每个到页面的链接都是对该页面的一次投票,被链接得越多,就意味着被其他网站投票越多。这个就是所谓的“链接流行度”——衡量多少人愿意将他们的网站和你的网站挂钩。PageRank这个概念源自学术界中论文被引述的频度——即一篇论文被别人引述的次数越多,一般判断这篇论文的权威性就越高。
通常情况下,原创内容越多的站点,PageRank越容易提升,反之则相对比较困难。PageRank的上限值为10。在Google的评估中,能上10的网站真可谓凤毛麟角,即使算上Google,能成就PageRank 10这“伟业”者,放眼全球也不足40家。一般来说,个人站点评估值达到4即算办得不错,商业网站到6以上便算步入正轨了。
网上虽然有不少现成的查询器及源码,但是光用别人的毕竟不符合程序员风格,所以今天自己用Java重造轮子又写了个PageRank查询实现,捎带着把一些常用搜索引擎的网站链接及反向链接查询也加上了。
源码如下:
GooglePageRank.java
- package org.loon.test;
- import java.io.IOException;
- import java.util.Random;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- /**
- *Copyright2008
- *
- *LicensedundertheApacheLicense,Version2.0(the"License");youmaynot
- *usethisfileexceptincompliancewiththeLicense.Youmayobtainacopyof
- *theLicenseat
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- *Unlessrequiredbyapplicablelaworagreedtoinwriting,software
- *distributedundertheLicenseisdistributedonan"ASIS"BASIS,WITHOUT
- *WARRANTIESORCONDITIONSOFANYKIND,eitherexpressorimplied.Seethe
- *Licenseforthespecificlanguagegoverningpermissionsandlimitationsunder
- *theLicense.
- *
- *@projectloonframework
- *@authorchenpeng
- *@email:ceponline@yahoo.com.cn
- *@version0.1
- */
- public class GooglePageRank{
- //googlepagerank服务器ip地址列表(最近google小气了很多,反复查询一个封ip)
- final static String[]GoogleServiceIP= new String[]{ "64.233.161.100" ,
- "64.233.161.101" , "64.233.183.91" , "64.233.189.44" , "66.102.1.103" ,
- "66.102.9.115" , "66.249.89.83" , "66.249.91.99" , "66.249.93.190" };
- //google用识别标记
- final static private int GOOGLE_MAGIC= 0xE6359A60 ;
- //ch数值混合器
- private class CHMix{
- int a;
- int b;
- int c;
- public CHMix(){
- this ( 0 , 0 , 0 );
- }
- public CHMix( int a, int b, int c){
- this .a=a;
- this .b=b;
- this .c=c;
- }
- }
- /**
- *按google要求混合成ch数据
- *
- *@parammix
- */
- private static void mix( final CHMixmix){
- mix.a-=mix.b;
- mix.a-=mix.c;
- mix.a^=mix.c>> 13 ;
- mix.b-=mix.c;
- mix.b-=mix.a;
- mix.b^=mix.a<< 8 ;
- mix.c-=mix.a;
- mix.c-=mix.b;
- mix.c^=mix.b>> 13 ;
- mix.a-=mix.b;
- mix.a-=mix.c;
- mix.a^=mix.c>> 12 ;
- mix.b-=mix.c;
- mix.b-=mix.a;
- mix.b^=mix.a<< 16 ;
- mix.c-=mix.a;
- mix.c-=mix.b;
- mix.c^=mix.b>> 5 ;
- mix.a-=mix.b;
- mix.a-=mix.c;
- mix.a^=mix.c>> 3 ;
- mix.b-=mix.c;
- mix.b-=mix.a;
- mix.b^=mix.a<< 10 ;
- mix.c-=mix.a;
- mix.c-=mix.b;
- mix.c^=mix.b>> 15 ;
- }
- /**
- *获得ch数值混合器
- *
- *@return
- */
- public static CHMixgetInnerCHMix(){
- return new GooglePageRank(). new CHMix();
- }
- /**
- *通过url获得googlech(google数据库针对页面的全球唯一标识)
- *
- *@paramurl
- *@return
- */
- public static StringGoogleCH( final Stringurl){
- //格式化为google要求的info:url模式
- StringnUrl=String.format( "info:%s" , new Object[]{url});
- //获得新url字符串格式
- char []urls=nUrl.toCharArray();
- //获得新url长度
- int length=urls.length;
- //获得一个ch数值混合器
- CHMixchMix=GooglePageRank.getInnerCHMix();
- //为c注入google识别标识
- chMix.c=GOOGLE_MAGIC;
- //为a、b项注入google要求的初始标识
- chMix.a=chMix.b= 0x9E3779B9 ;
- int k= 0 ;
- int len=length;
- while (len>= 12 ){
- chMix.a+=( int )(urls[k+ 0 ]+(urls[k+ 1 ]<< 8 )
- +(urls[k+ 2 ]<< 16 )+(urls[k+ 3 ]<< 24 ));
- chMix.b+=( int )(urls[k+ 4 ]+(urls[k+ 5 ]<< 8 )
- +(urls[k+ 6 ]<< 16 )+(urls[k+ 7 ]<< 24 ));
- chMix.c+=( int )(urls[k+ 8 ]+(urls[k+ 9 ]<< 8 )
- +(urls[k+ 10 ]<< 16 )+(urls[k+ 11 ]<< 24 ));
- //获得混合运算后的数据
- GooglePageRank.mix(chMix);
- k+= 12 ;
- len-= 12 ;
- }
- chMix.c+=length;
- //产生googlech的11位标识
- switch (len){
- case 11 :
- chMix.c+=( int )(urls[k+ 10 ]<< 24 );
- case 10 :
- chMix.c+=( int )(urls[k+ 9 ]<< 16 );
- case 9 :
- chMix.c+=( int )(urls[k+ 8 ]<< 8 );
- case 8 :
- chMix.b+=( int )(urls[k+ 7 ]<< 24 );
- case 7 :
- chMix.b+=( int )(urls[k+ 6 ]<< 16 );
- case 6 :
- chMix.b+=( int )(urls[k+ 5 ]<< 8 );
- case 5 :
- chMix.b+=( int )(urls[k+ 4 ]);
- case 4 :
- chMix.a+=( int )(urls[k+ 3 ]<< 24 );
- case 3 :
- chMix.a+=( int )(urls[k+ 2 ]<< 16 );
- case 2 :
- chMix.a+=( int )(urls[k+ 1 ]<< 8 );
- case 1 :
- chMix.a+=( int )(urls[k+ 0 ]);
- break ;
- default :
- break ;
- }
- //获得混合运算后的数据
- GooglePageRank.mix(chMix);
- //获得未修订的CH
- Stringtch=String.valueOf(chMix.c);
- //矫正差值后反馈正确CH
- return String
- .format( "6%s" , new Object[]{tch.length()< 10 ?( "-" +tch)
- .intern():tch});
- }
- /**
- *正则匹配pagerank结果
- *
- *@paramvalue
- *@return
- */
- private static StringMatchRank( final Stringvalue){
- Patternpattern=Pattern.compile( "Rank_1:[0-9]:([0-9]+)" );
- Matchermatcher=pattern.matcher(value);
- if (matcher.find()){
- return matcher.group( 1 );
- }
- return "0" ;
- }
- /**
- *获得指定页面的googlepagerank值
- *
- *@paramurl
- *@return
- */
- public static StringGooglePR( final Stringurl){
- Stringrip=GoogleServiceIP[ new Random()
- .nextInt(GoogleServiceIP.length)];
- return GooglePR(url,rip);
- }
- /**
- *以指定的google服务器获得指定页面的googlepagerank值
- *
- *@paramurl
- *@paramip
- *@return
- */
- public static StringGooglePR( final Stringurl, final Stringip){
- //产生查询用唯一标识
- Stringchecksum=GoogleCH(url);
- //产生查询用url
- StringqueryUrl=String
- .format(
- "http://%s/search?client=navclient-auto&ch=%s&features=Rank&q=info:%s" ,
- new Object[]{ip,checksum,url});
- Stringresponse;
- try {
- response=SimpleWebClient.getRequestHttp(queryUrl);
- } catch (IOExceptione){
- response= "" ;
- }
- if (response.length()== 0 ){
- return "0" ;
- } else {
- return GooglePageRank.MatchRank(response);
- }
- }
- }
SimpleWebClient.java
- package org.loon.test;
- import java.io.BufferedInputStream;
- import java.io.ByteArrayOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.io.OutputStreamWriter;
- import java.net.HttpURLConnection;
- import java.net.URL;
- import java.util.HashMap;
- import java.util.Iterator;
- import java.util.Map;
- import java.util.Set;
- import java.util.Map.Entry;
- import sun.misc.BASE64Encoder;
- /**
- *Copyright2008
- *
- *LicensedundertheApacheLicense,Version2.0(the"License");youmaynot
- *usethisfileexceptincompliancewiththeLicense.Youmayobtainacopyof
- *theLicenseat
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- *Unlessrequiredbyapplicablelaworagreedtoinwriting,software
- *distributedundertheLicenseisdistributedonan"ASIS"BASIS,WITHOUT
- *WARRANTIESORCONDITIONSOFANYKIND,eitherexpressorimplied.Seethe
- *Licenseforthespecificlanguagegoverningpermissionsandlimitationsunder
- *theLicense.
- *
- *@projectloonframework
- *@authorchenpeng
- *@email:ceponline@yahoo.com.cn
- *@version0.1
- */
/**
 * Minimal blocking HTTP client built on {@link HttpURLConnection}.
 *
 * <p>Sends a request, follows redirects itself and returns the response body as a
 * string.
 */
public class SimpleWebClient {

    /** Maximum number of redirects followed for one request, to break redirect loops. */
    private static final int MAX_REDIRECTS = 10;

    /** Size of the buffer used while draining a response body. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * Sends a GET request and returns the response body decoded as UTF-8.
     *
     * @param urlString address to request
     * @return the response body
     * @throws IOException if the request fails
     */
    public static String getRequestHttp(String urlString) throws IOException {
        return getRequestHttp(urlString, "utf-8");
    }

    /**
     * Sends a GET request (5 second timeout) and returns the response body.
     *
     * @param urlString address to request
     * @param encoding charset name used to decode the body; {@code null} for the
     *            platform default
     * @return the response body
     * @throws IOException if the request fails
     */
    public static String getRequestHttp(String urlString, String encoding)
            throws IOException {
        return getRequestHttp(urlString, encoding, null, 5000);
    }

    /**
     * Sends a request and returns the response body.
     *
     * <p>Entries of {@code parameter} with the keys {@code "user"} and {@code "pass"}
     * (HTTP Basic credentials), {@code "method"} (request method, default GET) and
     * {@code "post"} (request body) configure the request; every other entry is sent
     * as a request header. Keys and values must be strings.
     *
     * <p>Redirects (301, 302, 303, 307, 308) are followed up to
     * {@value #MAX_REDIRECTS} times, re-sending the same method and body. For status
     * codes of 400 and above the error body is returned rather than an exception
     * being thrown; a response without a body yields an empty string.
     *
     * @param urlString address to request; {@code http://} is assumed when no scheme
     *            is given
     * @param encoding charset name used to decode the body; {@code null} for the
     *            platform default
     * @param parameter request options and extra headers, may be {@code null}
     * @param timeout connect and read timeout in milliseconds; values &lt;= 0 leave
     *            the connection defaults in place
     * @return the response body
     * @throws IOException if the URL is malformed, the connection fails, a redirect
     *             is unusable, or the encoding is not supported
     */
    public static String getRequestHttp(final String urlString,
            final String encoding, final Map<?, ?> parameter, final int timeout)
            throws IOException {
        String user = null;
        String password = null;
        String method = "GET";
        String post = null;
        final Map<String, String> headers = new HashMap<String, String>();
        if (parameter != null) {
            for (final Map.Entry<?, ?> option : parameter.entrySet()) {
                final String key = (String) option.getKey();
                final String value = (String) option.getValue();
                if ("user".equals(key)) {
                    user = value;
                } else if ("pass".equals(key)) {
                    password = value;
                } else if ("method".equals(key)) {
                    method = value;
                } else if ("post".equals(key)) {
                    post = value;
                } else {
                    headers.put(key, value);
                }
            }
        }

        // "Basic" and the credentials must be separated by a space. java.util.Base64
        // replaces sun.misc.BASE64Encoder, which is an internal class that newer JDKs
        // no longer ship and which inserted line breaks into long values.
        String authorization = null;
        if (user != null && password != null) {
            authorization = "Basic "
                    + java.util.Base64.getEncoder().encodeToString(
                            (user + ":" + password).getBytes());
        }

        URL url = new URL(toAbsoluteUrl(urlString));
        for (int redirects = 0;; redirects++) {
            final HttpURLConnection connection = (HttpURLConnection) url
                    .openConnection();
            try {
                if (authorization != null) {
                    connection.setRequestProperty("Authorization", authorization);
                }
                connection.setDoOutput(post != null);
                connection.setDoInput(true);
                connection.setUseCaches(false);
                // Redirects are handled by the loop below, not by the connection.
                connection.setInstanceFollowRedirects(false);
                connection.setRequestMethod(method);
                if (timeout > 0) {
                    connection.setConnectTimeout(timeout);
                    // Without a read timeout a stalled server blocks the caller forever.
                    connection.setReadTimeout(timeout);
                }
                // Default browser-like headers; caller-supplied headers override them.
                connection.setRequestProperty("User-Agent",
                        "Mozilla/4.0(compatible;MSIE7.0;)");
                connection.setRequestProperty(
                        "Accept",
                        "image/gif,image/x-xbitmap,image/jpeg,image/pjpeg,application/x-shockwave-flash,application/msword,application/vnd.ms-excel,application/vnd.ms-powerpoint,*/*");
                for (final Map.Entry<String, String> header : headers.entrySet()) {
                    connection.setRequestProperty(header.getKey(),
                            header.getValue());
                }

                if (post != null) {
                    final OutputStreamWriter outRemote = new OutputStreamWriter(
                            connection.getOutputStream());
                    try {
                        outRemote.write(post);
                        outRemote.flush();
                    } finally {
                        outRemote.close();
                    }
                }

                final int responseCode = connection.getResponseCode();
                if (isRedirect(responseCode)) {
                    if (redirects >= MAX_REDIRECTS) {
                        throw new IOException("Too many redirects (more than "
                                + MAX_REDIRECTS + ") for " + urlString);
                    }
                    url = resolveRedirect(url,
                            connection.getHeaderField("Location"));
                    continue;
                }

                final InputStream body = responseCode < 400 ? connection
                        .getInputStream() : connection.getErrorStream();
                // getErrorStream() is null when the server sent no error body.
                return body == null ? "" : readBody(body, encoding);
            } finally {
                connection.disconnect();
            }
        }
    }

    /** Adds a scheme to addresses given without one. */
    private static String toAbsoluteUrl(final String urlString) {
        if (urlString.startsWith("http://") || urlString.startsWith("https://")) {
            return urlString;
        }
        if (urlString.startsWith("//")) {
            // Protocol-relative form: only the scheme name is missing.
            return "http:" + urlString;
        }
        return "http://" + urlString;
    }

    /** Tells whether a status code is one of the redirects this client follows. */
    private static boolean isRedirect(final int responseCode) {
        return responseCode == HttpURLConnection.HTTP_MOVED_PERM
                || responseCode == HttpURLConnection.HTTP_MOVED_TEMP
                || responseCode == HttpURLConnection.HTTP_SEE_OTHER
                || responseCode == 307 || responseCode == 308;
    }

    /** Resolves a Location header (absolute or relative) against the current URL. */
    private static URL resolveRedirect(final URL current, final String location)
            throws IOException {
        if (location == null) {
            throw new IOException("Redirect without Location header from "
                    + current);
        }
        final URL target = new URL(current, location);
        final String protocol = target.getProtocol();
        if (!"http".equalsIgnoreCase(protocol)
                && !"https".equalsIgnoreCase(protocol)) {
            throw new IOException("Redirect to unsupported protocol: " + target);
        }
        return target;
    }

    /**
     * Drains a stream and decodes it into a string; the stream is always closed.
     *
     * @param in stream to read
     * @param encoding charset name, or {@code null} for the platform default
     * @return the decoded content
     * @throws IOException if reading fails or the encoding is not supported
     */
    private static String readBody(final InputStream in, final String encoding)
            throws IOException {
        try {
            final ByteArrayOutputStream collected = new ByteArrayOutputStream();
            final byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = in.read(buffer)) != -1) {
                collected.write(buffer, 0, read);
            }
            return encoding != null ? collected.toString(encoding) : collected
                    .toString();
        } finally {
            in.close();
        }
    }
}
WebAppraise.java
- package org.loon.test;
- import java.io.IOException;
- /**
- *Copyright2008
- *
- *LicensedundertheApacheLicense,Version2.0(the"License");youmaynot
- *usethisfileexceptincompliancewiththeLicense.Youmayobtainacopyof
- *theLicenseat
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- *Unlessrequiredbyapplicablelaworagreedtoinwriting,software
- *distributedundertheLicenseisdistributedonan"ASIS"BASIS,WITHOUT
- *WARRANTIESORCONDITIONSOFANYKIND,eitherexpressorimplied.Seethe
- *Licenseforthespecificlanguagegoverningpermissionsandlimitationsunder
- *theLicense.
- *
- *@projectloonframework
- *@authorchenpeng
- *@email:ceponline@yahoo.com.cn
- *@version0.1
- */
- public class WebAppraise{
- private StringgoogleSum;
- private StringbaiduSum;
- private StringmsnSum;
- private StringaltaVistaSum;
- private StringallTheWebSum;
- private StringyahooSum;
- private StringtestURL;
- public WebAppraise( final Stringurl){
- if (url!= null &&! "" .equals(url)){
- this .testURL=url.trim();
- if ( this .testURL.startsWith( "http://" )){
- this .testURL= this .testURL.substring( 7 );
- }
- if ( this .testURL.startsWith( "https://" )){
- this .testURL= this .testURL.substring( 8 );
- }
- } else {
- throw new RuntimeException( "urlisNULL!" );
- }
- }
- /**
- *分析指定链接结果,并返回整型数值
- *
- *@paramsearchURL
- *@paramanchor
- *@paramtrail
- *@return
- */
- private static int getLinks( final StringsearchURL, final Stringanchor,
- final Stringtrail){
- int count= 0 ;
- StringserverResponse;
- try {
- //我国特色……
- if (searchURL.startsWith( "http://www.baidu.com" )){
- //永不离休的gb2312同志(-_-||)
- serverResponse=SimpleWebClient.getRequestHttp(searchURL,
- "gb2312" );
- } else {
- serverResponse=SimpleWebClient.getRequestHttp(searchURL);
- }
- } catch (IOExceptione){
- serverResponse=e.getMessage();
- }
- int pos=serverResponse.indexOf(anchor);
- if (pos> 1 ){
- serverResponse=serverResponse.substring(pos+anchor.length());
- pos=serverResponse.indexOf(trail);
- Stringvalue=serverResponse.substring( 0 ,pos).trim();
- value=value.replace( "," , "" );
- value=value.replace( "." , "" );
- count=Integer.parseInt(value);
- }
- return count;
- }
- public StringgetAllTheWebSite(){
- return getAllTheWebSite( false );
- }
- public StringgetAllTheWebSite( boolean isDomain){
- try {
- StringallTheWeb;
- if (isDomain){
- allTheWeb= "http://www.alltheweb.com/search?cat=web&cs=utf8&rys=0&itag=crv&_sb_lang=any&q=linkdomain%3A"
- + this .testURL;
- } else {
- allTheWeb= "http://www.alltheweb.com/search?cat=web&cs=utf-8&q=link%3Ahttp%3A%2F%2F"
- + this .testURL+ "&_sb_lang=any" ;
- }
- allTheWebSum= ""
- +getLinks(allTheWeb, "<spanclass=/"ofSoMany/">" ,
- "</span>" );
- } catch (Exceptionex){
- allTheWebSum=ex.getMessage();
- }
- return allTheWebSum;
- }
- public StringgetAltaVistaSite(){
- return getAltaVistaSite( false );
- }
- public StringgetAltaVistaSite( boolean isDomain){
- try {
- StringaltaVista;
- if (isDomain){
- altaVista= "http://www.altavista.com/web/results?itag=ody&q=link%3A"
- + this .testURL+ "&kgs=0&kls=0" ;
- } else {
- altaVista= "http://www.altavista.com/web/results?itag=ody&kgs=0&kls=0&q=site%3A"
- + this .testURL;
- }
- altaVistaSum= "" +getLinks(altaVista, "AltaVistafound" , "" );
- } catch (Exceptionex){
- altaVistaSum=ex.getMessage();
- }
- return altaVistaSum;
- }
- public StringgetGooglePR(){
- return GooglePageRank.GooglePR( this .testURL);
- }
- public StringgetGoogleSite(){
- return getGoogleSite( false );
- }
- public StringgetGoogleSite( final boolean isDomian){
- try {
- Stringgoogle;
- //反向链接
- if (isDomian){
- google= "http://www.google.com/search?hl=en&q=link%3A"
- + this .testURL;
- } else {
- google= "http://www.google.com/search?hl=en&q=site%3A"
- + this .testURL+ "&btnG=Google+Search&aq=f&oq=" ;
- }
- googleSum= "" +getLinks(google, "about<b>" , "</b>" );
- } catch (Exceptionex){
- googleSum=ex.getMessage();
- }
- return googleSum;
- }
- public StringgetBaiduSite(){
- return getBaiduSite( false );
- }
- public StringgetBaiduSite( final boolean isDomian){
- try {
- Stringbaidu;
- if (isDomian){
- baidu= "http://www.baidu.com/s?wd=domain%3A" + this .testURL
- + "&cl=3" ;
- } else {
- baidu= "http://www.baidu.com/s?wd=site%3A" + this .testURL;
- }
- baiduSum= "" +getLinks(baidu, "找到相关网页" , "篇" );
- } catch (Exceptionex){
- Stringbaidu;
- if (isDomian){
- baidu= "http://www.baidu.com/s?wd=domain%3A" + this .testURL
- + "&cl=3" ;
- } else {
- baidu= "http://www.baidu.com/s?wd=site%3A" + this .testURL;
- }
- baiduSum= "" +getLinks(baidu, "找到相关网页约" , "篇" );
- }
- return baiduSum;
- }
- public StringgetYahooSite(){
- return getYahooSite( false );
- }
- public StringgetYahooSite( final boolean isDomian){
- try {
- Stringyahoo;
- if (isDomian){
- yahoo= "http://sitemap.cn.yahoo.com/search?p=" + this .testURL
- + "&bwm=i" ;
- yahooSum= "" +getLinks(yahoo, "<strong>" , "</strong>" );
- } else {
- yahoo= "http://www.yahoo.cn/s?p=site%3A" + this .testURL
- + "&pid=hp&v=web" ;
- yahooSum= "" +getLinks(yahoo, "找到相关网页约" , "条" );
- }
- } catch (Exceptionex){
- yahooSum=ex.getMessage();
- }
- return yahooSum;
- }
- public StringgetMsnSite(){
- return getMsnSite( false );
- }
- public StringgetMsnSite( boolean isDomain){
- try {
- Stringmsn;
- if (isDomain){
- msn= "http://cnweb.search.live.com/results.aspx?q=link%3A"
- + this .testURL+ "&mkt=zh-cn&scope=&FORM=LIVSO" ;
- } else {
- msn= "http://cnweb.search.live.com/results.aspx?q=site%3A"
- + this .testURL+ "&go=&form=QBRE" ;
- }
- msnSum= "" +getLinks(msn, "共" , "条搜索结果" );
- } catch (Exceptionex){
- msnSum=ex.getMessage();
- }
- return msnSum;
- }
- public StringgetTestURL(){
- return testURL;
- }
- }
Test.java
- packageorg.loon.test;
- /**
- *Copyright2008
- *
- *LicensedundertheApacheLicense,Version2.0(the"License");youmaynot
- *usethisfileexceptincompliancewiththeLicense.Youmayobtainacopyof
- *theLicenseat
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- *Unlessrequiredbyapplicablelaworagreedtoinwriting,software
- *distributedundertheLicenseisdistributedonan"ASIS"BASIS,WITHOUT
- *WARRANTIESORCONDITIONSOFANYKIND,eitherexpressorimplied.Seethe
- *Licenseforthespecificlanguagegoverningpermissionsandlimitationsunder
- *theLicense.
- *
- *@projectloonframework
- *@authorchenpeng
- *@email:ceponline@yahoo.com.cn
- *@version0.1
- */
- public class Test{
- public static void main(String[]args){
- WebAppraiseappraise= new WebAppraise( "http://blog.csdn.net/cping1982" );
- System. out .println( "GooglePagerRank值:" +appraise.getGooglePR());
- System. out .println( "google收录:" +appraise.getGoogleSite());
- System. out .println( "google反向收录:" +appraise.getGoogleSite( true ));
- System. out .println( "yahoo收录:" +appraise.getYahooSite());
- System. out .println( "yahoo反向收录:" +appraise.getYahooSite( true ));
- System. out .println( "baidu收录:" +appraise.getBaiduSite());
- System. out .println( "baidu反向收录:" +appraise.getBaiduSite( true ));
- System. out .println( "msn收录:" +appraise.getMsnSite());
- System. out .println( "msn反向收录:" +appraise.getMsnSite( true ));
- System. out .println( "AllTheWeb收录:" +appraise.getAllTheWebSite());
- System. out .println( "AllTheWeb反向收录:" +appraise.getAllTheWebSite( true ));
- System. out .println( "AltaVista收录:" +appraise.getAltaVistaSite());
- System. out .println( "AltaVista反向收录:" +appraise.getAltaVistaSite( true ));
- }
- }
检测 http://blog.csdn.net/cping1982 运行结果如下图:
源码下载地址: http://download.csdn.net/source/929348