Linux获取网页源码的几种方法 - 遗世之都 - ITeye技术网站
JavaEye博客还是本科做毕业设计时候开通的,基本上荒废了,现在决定记录下平时编程遇到的问题或者解决方案。
第一种方法是利用 Linux 下的工具来获取网页源码。我用的是 wget,也可以使用 curl;curl 更加灵活,可以设置很多参数。
// Wraps `arg` in single quotes so that a POSIX shell treats it as one literal
// word. An embedded single quote is written as '\'' (close the quoted span,
// emit an escaped quote, reopen the span).
static string ShellQuote(const string &arg)
{
    string quoted = "'";
    for (string::size_type i = 0; i < arg.size(); ++i)
    {
        if (arg[i] == '\'')
        {
            quoted.append("'\\''");
        }
        else
        {
            quoted.push_back(arg[i]);
        }
    }
    quoted.push_back('\'');
    return quoted;
}

// Fetches a web page by running the external `wget` tool.
//
// The page is downloaded into a file in the current working directory whose
// name is the part of `url` after its last '/', read back into memory, and the
// file is then deleted.
//
// Returns the file's content with the lines concatenated (line terminators are
// dropped, as in the original implementation). Returns "" when `url` ends in
// '/' (no file name can be derived), when wget fails, or when the downloaded
// file cannot be opened.
string GetHtmlByWget(string url)
{
    // File name = everything after the last '/'; the whole URL if there is none.
    string::size_type slash = url.find_last_of("/");
    string fileName = (slash == string::npos) ? url : url.substr(slash + 1);
    if (fileName == "")
    {
        return "";
    }

    // -q suppresses progress output. -O forces the output name so the file we
    // read below is exactly the one wget wrote (without it wget may pick
    // "name.1" when "name" already exists). Both arguments are shell-quoted
    // because the URL is caller-supplied and goes through system().
    string strCom = "wget -q -O ";
    strCom.append(ShellQuote(fileName));
    strCom.append(" -- ");
    strCom.append(ShellQuote(url));
    int status = system(strCom.c_str());

    string strHtml = "";
    if (status == 0)
    {
        ifstream fin(fileName.c_str());
        if (fin)
        {
            // std::getline grows the string as needed, so arbitrarily long
            // lines (e.g. minified HTML) are read completely.
            string line;
            while (getline(fin, line))
            {
                strHtml.append(line);
            }
        }
    } // fin is closed here, before the file is removed

    // Delete the downloaded file (with -O, wget leaves an empty file behind
    // even when the download fails). -f: no prompt, no error if missing.
    strCom = "rm -f -- ";
    strCom.append(ShellQuote(fileName));
    system(strCom.c_str());

    return strHtml;
}
第二种方法是用 socket 来获取网页源码:
- //通过GET获取网页源码
- string GetHtmlByGet(string url)
- {
- string strHtmlContent = "" ;
- int sockfd;
- struct sockaddr_in addr;
- struct hostent *pURL;
- char text[RECVBUF];
- //分析链接
- UrlInfo urlInfo = ParseURL(url);
- string sAccept = "Accept: */*\r\nAccept-Language: zh-cn\r\nAccept-Encoding: gzip, deflate" ;
- //不同的主机UserAgent不同
- string sUserAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10" ;
- //将端口转换为字符串
- char t[6];
- string strPort;
- sprintf(t, "%d" , urlInfo.Port);
- strPort = t;
- //构造发送字符串
- string strRequest = "" ;
- strRequest.append( "GET " );
- strRequest.append(urlInfo.File);
- strRequest.append( "?" );
- strRequest.append(urlInfo.Body);
- strRequest.append( " HTTP/1.1\r\n" );
- strRequest.append(sAccept);
- strRequest.append( "\r\nUser-Agent:" );
- strRequest.append(sUserAgent);
- strRequest.append( "\r\nHost:" );
- strRequest.append(urlInfo.Host);
- strRequest.append( ":" );
- strRequest.append(strPort);
- strRequest.append( "\r\nConnection: Keep-Alive\r\n\r\n" );
- char * host = const_cast < char *>(urlInfo.Host.c_str());
- sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); //TCP方式发送
- pURL = gethostbyname(host);
- addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = *((unsigned long *)pURL->h_addr);
- addr.sin_port = htons(80);
- //连接
- connect(sockfd,( struct sockaddr *)&addr, sizeof (addr));
- //发送
- send(sockfd, const_cast < char *>(strRequest.c_str()), strRequest.length(), 0);
- //接受
- while (recv(sockfd, text, RECVBUF, 0) > 0)
- {
- strHtmlContent.append(text);
- bzero(text,RECVBUF);
- }
- //关闭socket
- close(sockfd);
- //返回接受结果
- return strHtmlContent;
- }
第三种方法是使用 libcurl:
- #include <stdio.h>
- #include <string.h>
- #include <curl/curl.h>
#define MAX_BUF 65536

/* Receive buffer; the extra byte leaves room for the terminating NUL. */
char wr_buf[MAX_BUF + 1];
/* Number of bytes currently stored in wr_buf (the next write offset). */
int wr_index;

/*
 * Write data callback function (called within the context of
 * curl_easy_perform).
 *
 * Appends the size * nmemb bytes at `buffer` to wr_buf at offset wr_index,
 * advances wr_index and keeps wr_buf NUL-terminated.
 *
 * userp: optional pointer to an int error flag; it is set to 1 when the
 *        segment is rejected. May be NULL.
 *
 * Returns the number of bytes stored. Returns 0 without touching wr_buf when
 * the segment does not fit in the remaining space (or size * nmemb overflows),
 * which tells curl that a problem occurred.
 */
size_t write_data(void *buffer, size_t size, size_t nmemb, void *userp)
{
    size_t segsize = 0;
    int fits = 0;

    /* Do the arithmetic in size_t and guard the multiplication, so a huge
     * segment cannot wrap around and slip past the bounds check. */
    if (wr_index >= 0 && wr_index <= MAX_BUF &&
        (nmemb == 0 || size <= (size_t)-1 / nmemb)) {
        segsize = size * nmemb;
        fits = segsize <= (size_t)(MAX_BUF - wr_index);
    }

    if (!fits) {
        if (userp != NULL) {
            *(int *)userp = 1;
        }
        return 0;
    }

    /* Copy the data from the curl buffer into our buffer */
    if (segsize > 0) {
        memcpy(&wr_buf[wr_index], buffer, segsize);
    }

    /* Update the write index (segsize <= MAX_BUF here, so the cast is safe) */
    wr_index += (int)segsize;

    /* Null terminate the buffer */
    wr_buf[wr_index] = 0;

    /* Return the number of bytes received, indicating to curl that all is okay */
    return segsize;
}
- /*
- * Simple curl application to read the index.html file from a Web site.
- */
- int main( void )
- {
- CURL *curl;
- CURLcode ret;
- int wr_error;
- wr_error = 0 ;
- wr_index = 0 ;
- /* First step, init curl */
- curl = curl_easy_init();
- if (!curl) {
- printf( "couldn't init curl\n" );
- return 0 ;
- }
- /* Tell curl the URL of the file we're going to retrieve */
- curl_easy_setopt( curl, CURLOPT_URL, "www.exampledomain.com" );
- /* Tell curl that we'll receive data to the function write_data, and
- * also provide it with a context pointer for our error return.
- */
- curl_easy_setopt( curl, CURLOPT_WRITEDATA, ( void *)&wr_error );
- curl_easy_setopt( curl, CURLOPT_WRITEFUNCTION, write_data );
- /* Allow curl to perform the action */
- ret = curl_easy_perform( curl );
- printf( "ret = %d (write_error = %d)\n" , ret, wr_error );
- /* Emit the page if curl indicates that no errors occurred */
- if ( ret == 0 ) printf( "%s\n" , wr_buf );
- curl_easy_cleanup( curl );
- return 0 ;
- }