[ OpenAI / WebsiteQnA tutorial ] 데이터 수집 - beautifulsoup 라이브러리를 통한 크롤링 (1)

Openai 2023. 2. 28. 20:34

import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os

# Regex pattern to match an absolute HTTP(S) URL.
# `s?` accepts exactly "http" or "https"; the previous `[s]*` also accepted
# misspelled schemes such as "httpss://".
HTTP_URL_PATTERN = r'^https?://.+'

# Define root domain to crawl
domain = "openai.com"
full_url = "https://openai.com/"

1. 데이터 수집 - beautifulsoup 라이브러리를 통한 크롤링

# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    """Collect the ``href`` value of every anchor tag fed to the parser.

    After calling ``feed(html)``, ``hyperlinks`` holds the hrefs exactly as
    they are written in the markup, in document order. Duplicates are kept
    and relative links are not resolved.
    """

    def __init__(self):
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        """Record the href of an ``<a>`` start tag.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != "a":
            return

        href = dict(attrs).get("href")

        # HTMLParser reports a valueless attribute (`<a href>`) as None.
        # Storing it would hand callers a non-string "link", so skip it.
        if href is not None:
            self.hyperlinks.append(href)

HTMLParser 클래스를 상속받는 HyperlinkParser 클래스를 정의합니다. 생성자 오버라이딩을 통해 부모 클래스의 생성자를 호출한 뒤 hyperlinks 변수를 정의합니다. 또한 부모 클래스에 정의되어 있던 메서드 handle_starttag()를 오버라이딩하여 태그가 anchor 태그이며 href 속성을 지닌 경우 인스턴스 변수 hyperlinks에 href를 추가하도록 합니다. (HTMLParser의 feed 메서드는 인자로 받은 HTML 문자열을 파싱하면서, 시작 태그를 만날 때마다 태그 이름과 속성 목록을 인자로 하여 handle_starttag 등의 메서드를 호출합니다.)

# Function to get the hyperlinks from a URL
def get_hyperlinks(url, timeout=30):
    """Return the raw ``href`` values of all anchor tags on the page at ``url``.

    Fetching is best-effort: any failure to open or read the URL is printed
    and reported as "no links" so that one bad page does not stop a crawl.

    Args:
        url: URL to open with ``urllib.request.urlopen``.
        timeout: Seconds to wait on a blocking network operation before
            giving up. Without a timeout a stalled server would block the
            caller indefinitely.

    Returns:
        A list of href strings as written in the page (unresolved, possibly
        with duplicates). An empty list if the URL cannot be fetched or the
        response is not ``text/html``.
    """
    # Try to open the URL and read the HTML
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            headers = response.info()

            # get_content_type() lower-cases the value, drops parameters such
            # as "; charset=...", and falls back to "text/plain" when the
            # header is absent, so a missing Content-Type no longer raises.
            if headers.get_content_type() != "text/html":
                return []

            # Honour the charset the server declared; UTF-8 is only a default.
            charset = headers.get_content_charset() or "utf-8"
            raw = response.read()
    except Exception as e:
        # Deliberately broad: this is the crawler's best-effort fetch boundary.
        print(e)
        return []

    # Decode leniently: a few undecodable bytes should not discard every link
    # on the page. An unknown charset name falls back to UTF-8.
    try:
        html = raw.decode(charset, errors="replace")
    except LookupError:
        html = raw.decode("utf-8", errors="replace")

    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks

get_hyperlinks 함수는 인자로 받은 URL에서 href를 추출한 뒤 배열에 담아 반환합니다. urllib.request 모듈의 urlopen은 인자로 받은 url의 HTTPResponse Object를 반환하며 해당 객체는 info, read, readline 등의 메서드를 포함합니다. (자세한 내용은 아래 링크에서 확인 가능합니다.) [ with urllib.request.urlopen(url) as response ]에서는 해당 url의 HTTPResponse Object를 response에 저장하고 .info()를 통해 html 문서 여부를 판단한 뒤 html 문서인 경우 utf-8 방식으로 디코딩한 뒤 그 결과를 html 변수에 저장해 앞서 정의한 HyperlinkParser 인스턴스의 feed()에 인자로 전달해 href를 추출합니다. 여기서 with를 사용한 이유는 보다 간결하고 안전하게 리소스를 사용하고 해제하기 위해서입니다. (with 절이 끝나면 open 한 리소스는 자동으로 해제됩니다.)

[python module / urllib] urlopen() - URL의 데이터를 확인하는 방법

urlopen()은 urllib.request를 import함으로써 사용 가능합니다. urlopen()을 통해 인자로 받는 url의 response data를 얻거나 데이터를 POST 방식으로 서버에 전송 가능합니다. urlopen()은 HTTPResponse object를 반환합니다.

ojhallae.tistory.com

# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    """Return the unique, normalized links on ``url`` that stay on ``local_domain``.

    Each href returned by ``get_hyperlinks(url)`` is handled as follows:

    * fragments (``#...``) are dropped, since they point inside a page;
    * absolute ``http``/``https`` links are kept only when their host equals
      ``local_domain``;
    * links with any other scheme (``mailto:``, ``tel:``, ``javascript:``,
      ``data:`` ...) are skipped instead of being mistaken for paths;
    * protocol-relative links (``//host/path``) are kept only for the same
      host and are given the ``https:`` scheme;
    * every other link is treated as a path under the domain root and
      prefixed with ``https://<local_domain>/``;
    * one trailing ``/`` is removed so that ``/blog`` and ``/blog/`` compare
      equal.

    NOTE: relative links without a leading ``/`` are resolved against the
    domain root, not against the directory of ``url``.

    Args:
        local_domain: Host name (``netloc``) the crawl is restricted to.
        url: Page whose links should be extracted.

    Returns:
        A list of distinct absolute URLs, in no particular order.
    """
    clean_links = set()

    for link in get_hyperlinks(url):
        # A valueless `<a href>` may come through as None; nothing to follow.
        if not link:
            continue

        # Drop the fragment: "/docs#intro" and "/docs" are the same document.
        link = link.strip().split("#", 1)[0]
        if not link:
            continue

        try:
            parsed = urlparse(link)
        except ValueError:
            # urlparse rejects some malformed hosts (e.g. a broken IPv6
            # literal); such a link cannot be crawled anyway.
            continue

        if parsed.scheme in ("http", "https"):
            # Absolute URL: keep it only if it stays on the crawled domain.
            if parsed.netloc != local_domain:
                continue
            clean_link = link
        elif parsed.scheme:
            # mailto:, tel:, javascript:, data:, ftp: ... are not crawlable pages.
            continue
        elif parsed.netloc:
            # Protocol-relative link ("//host/path"): it names its own host.
            if parsed.netloc != local_domain:
                continue
            clean_link = "https:" + link
        else:
            # Relative link: anchor it at the domain root.
            if link.startswith("/"):
                link = link[1:]
            clean_link = "https://" + local_domain + "/" + link

        if clean_link.endswith("/"):
            clean_link = clean_link[:-1]
        clean_links.add(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(clean_links)

get_domain_hyperlinks 함수는 앞서 정의한 get_hyperlinks 함수를 통해 얻은 href들을 정리하는 함수입니다. 각각의 href가 absolute URL인지 확인한 뒤, 맞다면 urlparse 함수를 사용해 href의 domain을 추출해 local_domain과 비교하고, 동일한 경우에만 해당 링크를 사용합니다. absolute URL이 아닌 경우에는 href가 '/'로 시작하면 맨 앞의 '/'를 제거하고, '#' 또는 'mailto:'로 시작하면 continue를 통해 생략하고 다음 href로 넘어갑니다. 그 외의 relative URL은 앞에 "https://" + local_domain + "/"를 붙여 absolute URL로 만듭니다. 이렇게 얻은 링크는 마지막 부분의 '/'를 제거한 뒤 clean_links 배열에 추가하며, 함수는 마지막에 중복을 제거한 링크 목록을 반환합니다.

def crawl(url):
    """Crawl every same-domain page reachable from ``url`` and save its text.

    For each visited page the tag-stripped text is written to
    ``text/<domain>/<url without scheme, "/" replaced by "_">.txt``. The
    ``processed`` directory is also created, but nothing is written to it
    here.

    Pages that cannot be fetched are reported and skipped rather than
    aborting the whole crawl. Pages that only render the "enable JavaScript"
    placeholder are reported and not written, since they carry no content.

    Args:
        url: Absolute start URL; its host defines the crawl boundary.
    """
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Create a queue to store the URLs to crawl
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = {url}
    # Discovered links have one trailing "/" stripped, so register that form
    # of the start URL too; otherwise the start page is crawled twice.
    if url.endswith("/"):
        seen.add(url[:-1])

    # Create the directories for the text files and the csv files.
    # makedirs(exist_ok=True) avoids the check-then-create race.
    text_dir = os.path.join("text", local_domain)
    os.makedirs(text_dir, exist_ok=True)
    os.makedirs("processed", exist_ok=True)

    # While the queue is not empty, continue crawling
    while queue:

        # Get the next URL. NOTE: deque.pop() takes from the right end, so
        # pages are visited most-recently-discovered first (LIFO).
        url = queue.pop()
        print(url)  # for debugging and to see the progress

        # Fetch before opening the output file so that a failed request
        # neither aborts the crawl nor leaves an empty file behind.
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException as e:
            print("Unable to fetch page " + url + ": " + str(e))
            continue

        # Get the text from the URL using BeautifulSoup, without the tags
        text = BeautifulSoup(response.text, "html.parser").get_text()

        if "You need to enable JavaScript to run this app." in text:
            # The page only renders client-side; there is nothing to save.
            print("Unable to parse page " + url + " due to JavaScript being required")
        else:
            # Strip the scheme whatever its length ("http://" or "https://")
            # instead of assuming eight characters.
            file_name = url.split("://", 1)[-1].replace("/", "_") + ".txt"

            # Explicit UTF-8: the platform default encoding may be unable to
            # represent the page text.
            with open(os.path.join(text_dir, file_name), "w", encoding="utf-8") as f:
                f.write(text)

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)

# Start the crawl from the configured root URL. This is a module-level call,
# so it runs as soon as the file is executed or imported.
crawl(full_url)

crawl 함수에서는 앞서 정의한 함수와 클래스를 활용해 url에 대한 크롤링을 진행합니다. 접근할 각 url을 queue에 담은 뒤 활용하며, set 자료구조를 활용해 방문 여부를 확인하기 위한 seen 변수를 선언합니다. 각 url을 방문하기 전에 크롤링한 text 파일을 저장할 디렉토리와 csv 파일을 저장할 디렉토리를 생성합니다.

각 url에 대한 방문은 while 문과 queue를 통해 이루어집니다. 각 순회 과정에서는 queue에서 url을 꺼내는 연산(이 경우 .pop()이며, deque의 오른쪽 끝, 즉 가장 최근에 추가된 url을 꺼냅니다)을 통해 방문할 url을 url 변수에 저장하며 해당 url을 title로 가지는 파일을 생성하고 작성합니다. 파일에 쓰여지는 텍스트는 BeautifulSoup() 객체를 통해 얻어지는데, 이때 BeautifulSoup() 객체는 requests 라이브러리를 이용해 보낸 get 요청에 대한 응답과 parser 타입을 인자로 받아 생성됩니다. 예시를 통한 이해를 원한다면 아래 링크에서 가능합니다.

가비아 라이브러리

IT 콘텐츠 허브

library.gabia.com

또한, for문을 통해 get_domain_hyperlinks()의 반환값(링크들)을 순회하며 방문하지 않은 링크를 queue에 추가한 뒤 방문 처리하여 모든 링크를 방문하도록 합니다. 이러한 과정을 거쳐 crawl 함수는 각 페이지의 내용을 포함하는 텍스트 파일을 url을 제목으로 하여 저장합니다.

'Openai' 카테고리의 다른 글

[ OpenAI / WebsiteQnA tutorial ] 총정리 (1)	2023.02.28
[ OpenAI / WebsiteQnA tutorial ] Embedding을 이용한 Context 생성 및 응답 (4) (0)	2023.02.28
[ OpenAI / WebsiteQnA tutorial ] Embedding - openai 라이브러리를 통한 Embedding (3) (0)	2023.02.28
[ OpenAI / WebsiteQnA tutorial ] 데이터 가공 - tiktoken 라이브러리를 통한 데이터 프로세싱 (2) (0)	2023.02.28

ABOUT ME

COMPMOS COMPMOS

1. 데이터 수집 - beautifulsoup 라이브러리를 통한 크롤링

'Openai' 카테고리의 다른 글

티스토리툴바

ABOUT ME

1. 데이터 수집 - beautifulsoup 라이브러리를 통한 크롤링

'Openai' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바