Windows下比较简单的获取网页源码的方法

第一种方法是使用MFC里面的CInternetSession类(配合OpenURL返回的文件对象):

// Fetches the resource at strUrl through MFC's CInternetSession and returns
// everything read from it as a single CString.
//
// strUrl  - URL to open.
// returns - the pieces returned by successive ReadString() calls, concatenated
//           in order; an empty string when OpenURL yields no file object.
//
// Exceptions raised by OpenURL/ReadString are not swallowed: the file object is
// released and the exception is rethrown to the caller.
//
// NOTE(review): ReadString appears to hand back one line at a time without its
// line terminator, so line breaks are probably not preserved in the result —
// confirm against the MFC documentation before relying on the exact layout.
CString GetHttpFileData(CString strUrl)
{
    // _T() keeps the agent string valid in both MBCS and Unicode builds.
    CInternetSession Session(_T("Internet Explorer"), 0);
    CString strData;

    // OpenURL returns a heap-allocated file object owned by the caller. It is
    // kept as the declared return type instead of being force-cast to
    // CHttpFile*, because only ReadString() is needed here.
    CStdioFile *pFile = Session.OpenURL(strUrl);
    if (pFile == NULL)
    {
        Session.Close();
        return strData;
    }

    try
    {
        CString strClip;
        while (pFile->ReadString(strClip))
        {
            strData += strClip;
        }
    }
    catch (CException *)
    {
        // Release the file object before letting the exception propagate.
        pFile->Close();
        delete pFile;
        throw;
    }

    pFile->Close();
    delete pFile;
    Session.Close();
    return strData;
}

要讲一下,pHttpFile->ReadString() 每次可能只读一个数据片断,读多少次取决于网络状况,所以要把每次读到的数据加到总数据的尾部,用了CString 省去了缓冲区处理:)
别忘了包含头文件 #include <afxinet.h>;另外在工程设置里面要选择 using MFC,要不然编译不了。
第二种是使用WinINet的纯API实现的:
#define MAXBLOCKSIZE 1024 #include #include #pragma comment(lib, "wininet.lib")void GetWebSrcCode(const char *Url); int _tmain(int argc, _TCHAR* argv[]) { GetWebSrcCode("http://www.cnblogs.com/"); return 0; }void GetWebSrcCode(const char *Url) { HINTERNET hSession = InternetOpen("zwt", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0); if (hSession != NULL) { HINTERNET hURL = InternetOpenUrl(hSession, Url, NULL, 0, INTERNET_FLAG_DONT_CACHE, 0); if (hURL != NULL) { char Temp[MAXBLOCKSIZE] = {0}; ULONG Number = 1; FILE *stream; if( (stream = fopen( "E:\\test.html", "wb" )) != NULL ) { while (Number > 0) { InternetReadFile(hURL, Temp, MAXBLOCKSIZE - 1, &Number); fwrite(Temp, sizeof (char), Number , stream); } fclose( stream ); }InternetCloseHandle(hURL); hURL = NULL; }InternetCloseHandle(hSession); hSession = NULL; } }

第三种就是使用非封装过的Socket实现了
int main(int argc, char* argv[]) { SOCKET hsocket; SOCKADDR_IN saServer; WSADATA wsadata; LPHOSTENT lphostent; int nRet; char Dest[3000]; char* host_name="blog.sina.com.cn"; char* req="GET /s/blog_44acab2f01016gz3.html HTTP/1.1\r\n" "User-Agent: Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0C; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)\r\n" "Host:blog.sina.com.cn\r\n\r\n"; // 初始化套接字 if(WSAStartup(MAKEWORD(2,2),&wsadata)) printf("初始化SOCKET出错!"); lphostent=gethostbyname(host_name); if(lphostent==NULL) printf("lphostent为空!"); hsocket = socket(AF_INET,SOCK_STREAM,IPPROTO_TCP); saServer.sin_family = AF_INET; saServer.sin_port = htons(80); saServer.sin_addr =*((LPIN_ADDR)*lphostent->h_addr_list); // 利用SOCKET连接 nRet = connect(hsocket,(LPSOCKADDR)&saServer,sizeof(SOCKADDR_IN)); if(nRet == SOCKET_ERROR) { printf("建立连接时出错!"); closesocket(hsocket); return 0; } // 利用SOCKET发送nRet = send(hsocket,req,strlen(req),0); if(nRet==SOCKET_ERROR) { printf("发送数据包时出错!"); closesocket(hsocket); } nRet=1; while(nRet>0) { // 接收返回数据包 nRet=recv(hsocket,(LPSTR)Dest,sizeof(Dest),0); if(nRet>0) Dest[nRet]=0; else Dest[0]=0; char sDest[3000] = {0}; UTF8_2_GB2312(sDest,nRet,Dest,nRet); // 显示返回数据包的大小、内容 //printf("\nReceived bytes:%d\n",nRet); printf("Result:\n%s",sDest); } }

另外,以上我们获取网页的时候,获取到的可能是UTF8,似乎目前大多数网站都用的这种编码吧!下面是编码转换。
void UTF_8ToUnicode(wchar_t* pOut,char *pText) { char* uchar = (char *)pOut; uchar[1] = ((pText[0] & 0x0F) << 4) + ((pText[1] >> 2) & 0x0F); uchar[0] = ((pText[1] & 0x03) << 6) + (pText[2] & 0x3F); } void Gb2312ToUnicode(wchar_t* pOut,char *gbBuffer) { ::MultiByteToWideChar(CP_ACP,MB_PRECOMPOSED,gbBuffer,2,pOut,1); } void UTF_8ToGB2312(char*pOut, char *pText, int pLen) { char Ctemp[4]; memset(Ctemp,0,4); int i =0 ,j = 0; while(i < pLen) { if(pText[i] >= 0) { pOut[j++] = pText[i++]; } else { WCHAR Wtemp; UTF_8ToUnicode(&Wtemp,pText + i); UnicodeToGB2312(Ctemp,Wtemp); pOut[j] = Ctemp[0]; pOut[j + 1] = Ctemp[1]; i += 3; j += 2; } } pOut[j] ='\n'; return; }

这是转换成GB2312的代码
转载于:https://www.cnblogs.com/croot/p/3391003.html

    推荐阅读