#-*- coding: utf-8 -*-
import sys
import os
import ssl
from socket import *
def main(argv):
    """Crawl one fixed Naver News section page.

    ``argv`` is accepted for the script entry point but is not read.
    """
    host = "news.naver.com"
    page = "/main/main.nhn"
    query = "?mode=LSD&mid=shm&sid1=102"
    crawl(host, 443, page, query)
def crawl(dHost, target_port="80", path="/", paramGET=""):
    """Fetch one page with a hand-built HTTP GET, save it, and return its text.

    The raw response (status line, headers and body, including any chunked
    framing) is decoded to text, the ``<meta charset=...>`` tags for utf-8 and
    euc-kr are stripped, and the result is written as UTF-8 to
    ``./<dHost>_crawl.html``.

    Args:
        dHost: Host name to connect to; also sent in the ``Host`` header and
            used as the output file name prefix.
        target_port: 80 for plain HTTP or 443 for HTTPS. May be an int or a
            numeric string (the default is the string ``"80"``).
        path: Request path, e.g. ``"/main/main.nhn"``.
        paramGET: Query string including the leading ``"?"``, or ``""``.

    Returns:
        The decoded response text that was written to the file.

    Raises:
        ValueError: If ``target_port`` is neither 80 nor 443.
        OSError: On connection, TLS or file errors.
    """
    # The default is the string "80" while callers pass ints; normalise once so
    # the port checks below cannot silently fall through.
    port = int(target_port)
    if port not in (80, 443):
        raise ValueError("Unsupported port (only 80 and 443): %r" % (target_port,))

    # Prepare the socket.
    print(" (Notice) Prepare the socket.")
    s_sock = socket(AF_INET, SOCK_STREAM)
    try:
        if port == 443:
            # Default context negotiates a modern TLS version and verifies the
            # server certificate and host name.
            context = ssl.create_default_context()
            s_sock = context.wrap_socket(s_sock, server_hostname=dHost)
        s_sock.connect((dHost, port))

        # Prepare the outgoing request message.
        print(" (Notice) Prepare the request message.")
        request = "GET " + path + paramGET + " HTTP/1.1\r\n"
        request += "Host:" + dHost + "\r\n"
        # Ask the server to close after the response so end-of-stream marks the
        # end of the body even when the response is not chunked.
        request += "Connection: close\r\n"
        request += "\r\n"
        # sendall() keeps writing until the whole request has been sent.
        s_sock.sendall(request.encode())

        # Receive until the peer closes or the chunked terminator arrives.
        chunks = []
        tail = b""
        while True:
            tmpHttpResponse = s_sock.recv(4096)
            if not tmpHttpResponse:
                # Peer closed the connection: nothing more will arrive.
                break
            chunks.append(tmpHttpResponse)
            # Track the last bytes across reads so a terminator split over two
            # recv() calls is still detected, and only at the very end.
            tail = (tail + tmpHttpResponse)[-7:]
            if tail == b"\r\n0\r\n\r\n":
                break
    finally:
        s_sock.close()

    httpResponse = b"".join(chunks)
    httpResponseLen = len(httpResponse)
    print(" (Notice) Recieved data(Byte):", httpResponseLen)

    # Decode: prefer euc-kr, then utf-8; never loop forever on a bad codec.
    for encoding in ("euc-kr", "utf-8"):
        try:
            dHttpResponse = httpResponse.decode(encoding)
        except UnicodeDecodeError:
            print(" (Error) UnicodeDecodeError. → Modify the way of decoding.")
            continue
        break
    else:
        # Last resort: keep whatever is decodable instead of failing.
        dHttpResponse = httpResponse.decode("utf-8", errors="replace")
    print(" (Notice) Complete decoding.")

    # Remove charset declarations so the saved UTF-8 file shows Korean text.
    dHttpResponse = dHttpResponse.replace("<meta charset=\"utf-8\">", "")
    dHttpResponse = dHttpResponse.replace("<meta charset=\"euc-kr\">", "")

    # Write the result.
    with open("./" + dHost + "_crawl.html", "w", encoding="utf-8") as f:
        f.write(dHttpResponse)
    print(" (Notice) Complete crawl.")
    return dHttpResponse
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main(sys.argv)
import sys
import os
import ssl
from socket import *
def main(argv):
    """Crawl one fixed Naver News section page.

    ``argv`` is accepted for the script entry point but is not read.
    """
    host = "news.naver.com"
    page = "/main/main.nhn"
    query = "?mode=LSD&mid=shm&sid1=102"
    crawl(host, 443, page, query)
def crawl(dHost, target_port="80", path="/", paramGET=""):
    """Fetch one page with a hand-built HTTP GET, save it, and return its text.

    The raw response (status line, headers and body, including any chunked
    framing) is decoded to text, the ``<meta charset=...>`` tags for utf-8 and
    euc-kr are stripped, and the result is written as UTF-8 to
    ``./<dHost>_crawl.html``.

    Args:
        dHost: Host name to connect to; also sent in the ``Host`` header and
            used as the output file name prefix.
        target_port: 80 for plain HTTP or 443 for HTTPS. May be an int or a
            numeric string (the default is the string ``"80"``).
        path: Request path, e.g. ``"/main/main.nhn"``.
        paramGET: Query string including the leading ``"?"``, or ``""``.

    Returns:
        The decoded response text that was written to the file.

    Raises:
        ValueError: If ``target_port`` is neither 80 nor 443.
        OSError: On connection, TLS or file errors.
    """
    # The default is the string "80" while callers pass ints; normalise once so
    # the port checks below cannot silently fall through.
    port = int(target_port)
    if port not in (80, 443):
        raise ValueError("Unsupported port (only 80 and 443): %r" % (target_port,))

    # Prepare the socket.
    print(" (Notice) Prepare the socket.")
    s_sock = socket(AF_INET, SOCK_STREAM)
    try:
        if port == 443:
            # Default context negotiates a modern TLS version and verifies the
            # server certificate and host name.
            context = ssl.create_default_context()
            s_sock = context.wrap_socket(s_sock, server_hostname=dHost)
        s_sock.connect((dHost, port))

        # Prepare the outgoing request message.
        print(" (Notice) Prepare the request message.")
        request = "GET " + path + paramGET + " HTTP/1.1\r\n"
        request += "Host:" + dHost + "\r\n"
        # Ask the server to close after the response so end-of-stream marks the
        # end of the body even when the response is not chunked.
        request += "Connection: close\r\n"
        request += "\r\n"
        # sendall() keeps writing until the whole request has been sent.
        s_sock.sendall(request.encode())

        # Receive until the peer closes or the chunked terminator arrives.
        chunks = []
        tail = b""
        while True:
            tmpHttpResponse = s_sock.recv(4096)
            if not tmpHttpResponse:
                # Peer closed the connection: nothing more will arrive.
                break
            chunks.append(tmpHttpResponse)
            # Track the last bytes across reads so a terminator split over two
            # recv() calls is still detected, and only at the very end.
            tail = (tail + tmpHttpResponse)[-7:]
            if tail == b"\r\n0\r\n\r\n":
                break
    finally:
        s_sock.close()

    httpResponse = b"".join(chunks)
    httpResponseLen = len(httpResponse)
    print(" (Notice) Recieved data(Byte):", httpResponseLen)

    # Decode: prefer euc-kr, then utf-8; never loop forever on a bad codec.
    for encoding in ("euc-kr", "utf-8"):
        try:
            dHttpResponse = httpResponse.decode(encoding)
        except UnicodeDecodeError:
            print(" (Error) UnicodeDecodeError. → Modify the way of decoding.")
            continue
        break
    else:
        # Last resort: keep whatever is decodable instead of failing.
        dHttpResponse = httpResponse.decode("utf-8", errors="replace")
    print(" (Notice) Complete decoding.")

    # Remove charset declarations so the saved UTF-8 file shows Korean text.
    dHttpResponse = dHttpResponse.replace("<meta charset=\"utf-8\">", "")
    dHttpResponse = dHttpResponse.replace("<meta charset=\"euc-kr\">", "")

    # Write the result.
    with open("./" + dHost + "_crawl.html", "w", encoding="utf-8") as f:
        f.write(dHttpResponse)
    print(" (Notice) Complete crawl.")
    return dHttpResponse
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main(sys.argv)
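When the above Python program is executed, it writes the fetched response to the file "<host>_crawl.html" (for example, "news.naver.com_crawl.html") in the current working directory.
위 파이썬 프로그램을 실행하면, 현재 작업 디렉터리에 "<호스트명>_crawl.html" 파일(예: "news.naver.com_crawl.html")이 생성된다.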