July 01, 2019

How to create an HTTP request message (crawling) using Python



#-*- coding: utf-8 -*-

import sys
import os
import ssl
from socket import *

def main(argv):
    """Run one fixed crawl: GET /main/main.nhn from news.naver.com on port 443.

    Args:
        argv: Command-line argument list. Accepted so the function can be
            called as ``main(sys.argv)``; it is not read.
    """
    host = "news.naver.com"
    port = 443
    page = "/main/main.nhn"
    query = "?mode=LSD&mid=shm&sid1=102"
    # The decoded response is also written to disk by crawl(), so the
    # return value is not needed here.
    crawl(host, port, page, query)
 
 
def _recv_response(sock):
    """Read one HTTP response from *sock* and return it as raw bytes.

    Reading stops when the peer closes the connection (``recv`` returns
    ``b""``) or, for a response whose headers mention
    ``Transfer-Encoding: chunked``, as soon as the buffered data ends with the
    zero-length final chunk.
    """
    buf = bytearray()
    header_end = -1
    chunked = False
    while True:
        chunk = sock.recv(4096)
        if not chunk:
            # Peer closed the connection: nothing more will arrive.
            break
        buf += chunk
        if header_end < 0:
            header_end = buf.find(b"\r\n\r\n")
            if header_end < 0:
                # Headers are still incomplete; keep reading.
                continue
            head = bytes(buf[:header_end]).lower()
            chunked = b"transfer-encoding" in head and b"chunked" in head
        # Check the accumulated buffer (not just the last chunk) so a
        # terminator split across two recv() calls is still detected, and
        # require the terminator to lie inside the body, not the headers.
        if (chunked
                and len(buf) >= header_end + 4 + 5
                and buf.endswith(b"\r\n0\r\n\r\n")):
            break
    return bytes(buf)


def _decode_response(raw):
    """Decode *raw* as EUC-KR, then UTF-8, then EUC-KR with replacement.

    The last step cannot fail, so decoding always terminates.
    """
    for codec in ("euc-kr", "utf-8"):
        try:
            return raw.decode(codec)
        except UnicodeDecodeError:
            print("  (Error) UnicodeDecodeError. → Modify the way of decoding.")
    return raw.decode("euc-kr", errors="replace")


def crawl(dHost, target_port="80", path="/", paramGET=""):
    """Send one HTTP GET request over a raw socket and save the response.

    The complete response (status line, headers and body exactly as received)
    is decoded to text, written to ``./<dHost>_crawl.html`` as UTF-8 and
    returned.

    Args:
        dHost: Host name to connect to; also sent in the ``Host`` header and
            used as the prefix of the output file name.
        target_port: TCP port, as an int or a numeric string. Port 443 is
            wrapped in TLS with certificate and host-name verification; every
            other port uses a plain TCP connection.
        path: Request path, e.g. ``"/main/main.nhn"``.
        paramGET: Query string including the leading ``"?"``, or ``""``.

    Returns:
        The decoded response text with the ``<meta charset=...>`` tags for
        utf-8 and euc-kr removed.

    Raises:
        ValueError: If ``target_port`` is not a valid integer.
        OSError: On connection, TLS or socket I/O failure.
    """
    # The default is the string "80", so normalise before comparing.
    port = int(target_port)

    # Prepare the socket.
    print("  (Notice) Prepare the socket.")
    s_sock = socket(AF_INET, SOCK_STREAM)
    try:
        if port == 443:
            # The default context verifies the certificate chain and host name.
            context = ssl.create_default_context()
            s_sock = context.wrap_socket(s_sock, server_hostname=dHost)
        s_sock.connect((dHost, port))

        # Prepare the outgoing request message.
        print("  (Notice) Prepare the request message.")
        request = "".join([
            "GET ", path, paramGET, " HTTP/1.1\r\n",
            "Host:", dHost, "\r\n",
            # Ask the server to close after responding so that end-of-stream
            # reliably marks the end of the response.
            "Connection: close\r\n",
            "\r\n",
        ])
        # sendall() retries until the whole request has been transmitted.
        s_sock.sendall(request.encode())

        httpResponse = _recv_response(s_sock)
    finally:
        s_sock.close()

    print("  (Notice) Recieved data(Byte):", len(httpResponse))

    dHttpResponse = _decode_response(httpResponse)
    print("  (Notice) Complete decoding.")

    # Remove the charset declarations (original intent: keep Korean text
    # readable in the saved copy, which is always written as UTF-8).
    dHttpResponse = dHttpResponse.replace("<meta charset=\"utf-8\">", "")
    dHttpResponse = dHttpResponse.replace("<meta charset=\"euc-kr\">", "")

    # Write the result.
    with open("./" + dHost + "_crawl.html", "w", encoding="utf-8") as f:
        f.write(dHttpResponse)

    print("  (Notice) Complete crawl.")
    return dHttpResponse
 
 
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main(sys.argv)

When the above Python program is executed, the file "xxx_crawl.html" (where "xxx" is the target host name) is created in the current working directory.

위 파이썬 프로그램을 실행하면, 현재 작업 디렉터리에 "xxx_crawl.html" 파일("xxx"는 대상 호스트 이름)이 생성된다.