반응형
네이버에서 현재 서비스 중인 모든 웹툰을 크롤링하여 결과를 엑셀 파일로 저장하는 코드이다.
가져오는 정보는 시리즈ID, 작품명, 저자, 성인작품 여부, 별점, 연재 요일 등이다.
import openpyxl
import requests
from tqdm import tqdm
import time
def series_info(series_id, timeout=10):
    """Return the favorite count and the first genre tag of a series.

    Queries ``/api/article/list/info`` for ``series_id`` and reads
    ``favoriteCount`` plus the ``tagName`` of the first entry in
    ``curationTagList`` whose ``curationType`` starts with ``'GENRE_'``.

    Args:
        series_id: Title ID of the series; converted with ``str()`` when
            building the URL.
        timeout: Seconds to wait for the HTTP response before
            ``requests`` raises a timeout error.

    Returns:
        A ``(favorite_count, genre)`` tuple. Both elements are ``None``
        when the response is not valid JSON or lacks the expected fields
        (according to the original author's note, adult titles require a
        login and therefore end up here).

    Raises:
        requests.RequestException: On connection failures or timeouts;
            these are intentionally not swallowed.
    """
    series_info_url = 'https://comic.naver.com/api/article/list/info?titleId=' + str(series_id)
    series_res = requests.get(series_info_url, timeout=timeout)
    try:
        # Decode inside the try so a non-JSON body (ValueError) is treated
        # the same way as a payload with missing fields.
        payload = series_res.json()
        favorite_count = payload['favoriteCount']
        genre = [
            tag['tagName']
            for tag in payload['curationTagList']
            if tag['curationType'].startswith('GENRE_')
        ][0]
    except (ValueError, KeyError, IndexError, TypeError, AttributeError):
        # Any missing or unexpected field invalidates both values, so the
        # caller never receives a half-filled result.
        favorite_count = None
        genre = None
    return favorite_count, genre
def episode_info(series_id, timeout=10):
    """Return the episode count and the latest episode's date text.

    Queries the first page of ``/api/article/list`` for ``series_id``
    with ``sort=DESC`` and reads ``totalCount`` plus the
    ``serviceDateDescription`` of the first entry in ``articleList``.
    The original author uses that value as the completion date of a
    finished series.

    Args:
        series_id: Title ID of the series; converted with ``str()`` when
            building the URL.
        timeout: Seconds to wait for the HTTP response before
            ``requests`` raises a timeout error.

    Returns:
        An ``(epi_cnt, completed_date)`` tuple. Both elements are
        ``None`` when the response is not valid JSON or lacks the
        expected fields (according to the original author's note, adult
        titles require a login and therefore end up here).

    Raises:
        requests.RequestException: On connection failures or timeouts;
            these are intentionally not swallowed.
    """
    episode_info_url = (
        'https://comic.naver.com/api/article/list?titleId=' + str(series_id) + '&page=1&sort=DESC'
    )
    episode_res = requests.get(episode_info_url, timeout=timeout)
    try:
        # Decode inside the try so a non-JSON body (ValueError) is treated
        # the same way as a payload with missing fields.
        payload = episode_res.json()
        epi_cnt = payload['totalCount']
        completed_date = payload['articleList'][0]['serviceDateDescription']
    except (ValueError, KeyError, IndexError, TypeError):
        # Any missing or unexpected field invalidates both values, so the
        # caller never receives a half-filled result.
        epi_cnt = None
        completed_date = None
    return epi_cnt, completed_date
## Finished series
## NOTE: last_page is hard-coded and must be updated by hand to match the
## number of pages the finished-title list currently has.
start_page = 1
last_page = 40

wb = openpyxl.Workbook()
ws = wb.active
ws.append([
    'series_id', 'series_title', 'author', 'is_adult', 'star_score', 'favorite_count', 'genre', 'epi_cnt', 'completed_date'
])

try:
    for page in tqdm(range(start_page, last_page + 1)):
        # Pause one second between list-page requests.
        time.sleep(1)
        url = 'https://comic.naver.com/api/webtoon/titlelist/finished?page=' + str(page) + '&pageSize=45&order=UPDATE'
        res = requests.get(url, timeout=10)

        # Handle a failed request BEFORE touching the body: an error
        # response may not be JSON or may lack 'titleList', which would
        # raise before this page could be skipped.
        if res.status_code != 200:
            print(str(page) + '_' + str(res.status_code))
            continue

        # Parse the titles on this page and enrich each one with two
        # extra per-series API calls.
        contents = res.json()['titleList']
        for content in contents:
            series_id = content['titleId']
            series_title = content['titleName']
            author = content['author']
            adult = content['adult']
            star_score = content['starScore']
            favorite_count, genre = series_info(series_id)
            epi_cnt, completed_date = episode_info(series_id)
            ws.append([
                series_id, series_title, author, adult, star_score, favorite_count, genre, epi_cnt, completed_date
            ])
finally:
    # Save whatever has been collected, even if the crawl is interrupted
    # or a request raises part-way through.
    wb.save('네이버웹툰_완결작.xlsx')
## Weekday serialization: one row per (title, weekday) pair.
wb = openpyxl.Workbook()
ws = wb.active
ws.append([
    'series_id', 'series_title', 'author', 'is_adult', 'star_score', 'day'
])

url = 'https://comic.naver.com/api/webtoon/titlelist/weekday?order=user'
res = requests.get(url, timeout=10)
# Fail with a clear HTTP error instead of an opaque JSON/KeyError when the
# request did not succeed.
res.raise_for_status()
contents = res.json()['titleListMap']

# 'titleListMap' maps each day key to the list of titles for that day.
for day, titles in contents.items():
    for content in titles:
        ws.append([
            content['titleId'],
            content['titleName'],
            content['author'],
            content['adult'],
            content['starScore'],
            day,
        ])

wb.save('네이버웹툰_연재_요일연재.xlsx')
## Daily serialization (the 'dailyPlus' week filter of the weekday API).
wb = openpyxl.Workbook()
ws = wb.active
ws.append([
    'series_id', 'series_title', 'author', 'is_adult', 'star_score'
])

url = 'https://comic.naver.com/api/webtoon/titlelist/weekday?week=dailyPlus&order=user'
res = requests.get(url, timeout=10)
# Fail with a clear HTTP error instead of an opaque JSON/KeyError when the
# request did not succeed.
res.raise_for_status()
contents = res.json()['titleList']

# Parse: one row per title in the returned list.
for content in contents:
    ws.append([
        content['titleId'],
        content['titleName'],
        content['author'],
        content['adult'],
        content['starScore'],
    ])

wb.save('네이버웹툰_연재_매일플러스.xlsx')
반응형
'파이썬' 카테고리의 다른 글
python 웹툰 크롤링 시리즈3: 카카오페이지 (2) | 2023.03.19 |
---|---|
python 웹툰 크롤링 시리즈2: 카카오웹툰 (1) | 2023.03.18 |
tqdm 사용법 파이썬 (0) | 2022.06.15 |
jupyter lab에서 ipython-sql로 DB 접속하기(SQL Server, MySQL, postgreSQL) (0) | 2021.08.25 |
파이썬 기초 통계분석 (0) | 2020.10.20 |