파이썬

python 웹툰 크롤링 시리즈1: 네이버웹툰

박범준2 2023. 3. 17. 00:03
반응형

네이버웹툰에서 현재 서비스 중인 모든 웹툰을 크롤링해서 결과를 엑셀 파일로 저장하는 코드이다.

가져오는 정보는 시리즈ID, 작품명, 저자, 성인작품 여부, 별점, 연재요일 등이며, 완결작은 관심수, 장르, 에피소드 개수, 완결일도 함께 가져온다.

import openpyxl
import requests
from tqdm import tqdm
import time


def series_info(series_id):
    """Return the favorite count and genre of a series.

    Requests the ``article/list/info`` endpoint for ``series_id`` and reads
    ``favoriteCount`` plus the ``tagName`` of the first ``curationTagList``
    entry whose ``curationType`` starts with ``GENRE_``.

    Args:
        series_id: Series (title) ID; converted with ``str`` to build the URL.

    Returns:
        A ``(favorite_count, genre)`` tuple. Both values are ``None`` when
        the body is not valid JSON or does not contain the expected fields
        (per the original author's note, adult works require a login and
        fail here).
    """
    url = 'https://comic.naver.com/api/article/list/info?titleId=' + str(series_id)
    response = requests.get(url)

    try:
        # Decode inside the try block so that a non-JSON body (ValueError)
        # takes the same fallback path as a payload with missing fields.
        payload = response.json()
        favorite_count = payload['favoriteCount']
        genre = [
            tag['tagName']
            for tag in payload['curationTagList']
            if tag['curationType'].startswith('GENRE_')
        ][0]
    except (ValueError, KeyError, IndexError, TypeError, AttributeError):
        # Named exceptions only: Ctrl-C and SystemExit must still stop the crawl.
        favorite_count = None
        genre = None
    return favorite_count, genre

def episode_info(series_id):
    """Return the episode count and completion date of a series.

    Requests the first page of the ``article/list`` endpoint for
    ``series_id`` with ``sort=DESC`` and reads ``totalCount`` plus the
    ``serviceDateDescription`` of the first ``articleList`` entry.

    Args:
        series_id: Series (title) ID; converted with ``str`` to build the URL.

    Returns:
        An ``(epi_cnt, completed_date)`` tuple. Both values are ``None``
        when the body is not valid JSON or does not contain the expected
        fields (per the original author's note, adult works require a login
        and fail here).
    """
    url = (
        'https://comic.naver.com/api/article/list?titleId='
        + str(series_id)
        + '&page=1&sort=DESC'
    )
    response = requests.get(url)

    try:
        # Decode inside the try block so that a non-JSON body (ValueError)
        # takes the same fallback path as a payload with missing fields.
        payload = response.json()
        epi_cnt = payload['totalCount']
        completed_date = payload['articleList'][0]['serviceDateDescription']
    except (ValueError, KeyError, IndexError, TypeError):
        # Named exceptions only: Ctrl-C and SystemExit must still stop the crawl.
        epi_cnt = None
        completed_date = None
    return epi_cnt, completed_date


## Finished series ## NOTE: last_page (the maximum page) must be adjusted by hand

start_page = 1
last_page = 40

wb = openpyxl.Workbook()
ws = wb.active
ws.append([
    'series_id', 'series_title', 'author', 'is_adult', 'star_score', 'favorite_count', 'genre', 'epi_cnt', 'completed_date'
])

for page in tqdm(range(start_page, last_page + 1)):
    # Pause one second before each list-page request.
    time.sleep(1)

    url = 'https://comic.naver.com/api/webtoon/titlelist/finished?page=' + str(page) + '&pageSize=45&order=UPDATE'
    res = requests.get(url)

    # Handle a failed request BEFORE decoding the body: decoding first would
    # raise on an error response and this skip-and-report branch would never run.
    if res.status_code != 200:
        print(str(page) + '_' + str(res.status_code))
        continue

    contents = res.json()['titleList']

    # Parse each title on the page and append one row per series.
    for content in contents:
        series_id = content['titleId']
        series_title = content['titleName']
        author = content['author']
        adult = content['adult']
        star_score = content['starScore']

        # Two extra requests per series for the detail fields.
        favorite_count, genre = series_info(series_id)
        epi_cnt, completed_date = episode_info(series_id)

        ws.append([
            series_id, series_title, author, adult, star_score, favorite_count, genre, epi_cnt, completed_date
        ])

wb.save('네이버웹툰_완결작.xlsx')


## Weekday serials

wb = openpyxl.Workbook()
ws = wb.active
weekday_header = ['series_id', 'series_title', 'author', 'is_adult', 'star_score', 'day']
ws.append(weekday_header)

# One request returns a mapping of day key -> list of titles.
weekday_url = 'https://comic.naver.com/api/webtoon/titlelist/weekday?order=user'
titles_by_day = requests.get(weekday_url).json()['titleListMap']

# Write one row per title, tagged with the day key it was listed under.
for day, day_titles in titles_by_day.items():
    for title in day_titles:
        ws.append([
            title['titleId'],
            title['titleName'],
            title['author'],
            title['adult'],
            title['starScore'],
            day,
        ])

wb.save('네이버웹툰_연재_요일연재.xlsx')


## Daily serials (dailyPlus)

wb = openpyxl.Workbook()
ws = wb.active
ws.append(['series_id', 'series_title', 'author', 'is_adult', 'star_score'])

daily_url = 'https://comic.naver.com/api/webtoon/titlelist/weekday?week=dailyPlus&order=user'
daily_titles = requests.get(daily_url).json()['titleList']

# Response fields to copy, in the same order as the header row above.
daily_fields = ('titleId', 'titleName', 'author', 'adult', 'starScore')

# Parse: one row per title.
for title in daily_titles:
    ws.append([title[field] for field in daily_fields])

wb.save('네이버웹툰_연재_매일플러스.xlsx')
반응형