반응형
웹툰 크롤링 시리즈 마지막 편은 카카오페이지이다
카카오페이지 전체 작품의 시리즈 ID, 작품명, 장르, 작가, 연령가, 조회수, 마지막 연재일, 연재 상태 정보를 가져오는 코드이다
카카오페이지는 GraphQL을 통해 데이터를 요청해야 해서 코드가 길다
import openpyxl
import requests
from tqdm import tqdm
import time
## All series (genre landing listing)
# GraphQL endpoint that the page loop further down POSTs to.
url = 'https://page.kakao.com/graphql'
# GraphQL document for the `staticLandingGenreSection` operation. It takes the
# `$sectionId` and `$param` variables and selects the `Section` fragment plus
# every fragment that one spreads (Group, Item, EventLogFragment, ...).
# The page loop below sends this string verbatim as the `query` field of the
# request body and reads `groups[0].items` from the response, so the text
# must not be edited or re-indented casually: it is runtime data, not code.
query = '''query staticLandingGenreSection($sectionId: ID!, $param: StaticLandingGenreParamInput!) {
staticLandingGenreSection(sectionId: $sectionId, param: $param) {
...Section
__typename
}
}
fragment Section on Section {
id
uid
type
title
... on DependOnLoggedInSection {
loggedInTitle
loggedInScheme
__typename
}
... on SchemeSection {
scheme
__typename
}
... on MetaInfoTypeSection {
metaInfoType
__typename
}
... on TabSection {
sectionMainTabList {
uid
title
isSelected
scheme
additionalString
subTabList {
uid
title
isSelected
groupId
__typename
}
__typename
}
__typename
}
... on ThemeKeywordSection {
themeKeywordList {
uid
title
scheme
__typename
}
__typename
}
... on StaticLandingDayOfWeekSection {
isEnd
totalCount
displayAd {
sectionUid
bannerUid
treviUid
momentUid
__typename
}
param {
categoryUid
businessModel {
name
param
__typename
}
subcategory {
name
param
__typename
}
dayTab {
name
param
__typename
}
page
size
__typename
}
businessModelList {
name
param
__typename
}
subcategoryList {
name
param
__typename
}
dayTabList {
name
param
__typename
}
__typename
}
... on StaticLandingTodayNewSection {
totalCount
param {
categoryUid
subcategory {
name
param
__typename
}
__typename
}
categoryTabList {
name
param
__typename
}
subcategoryList {
name
param
__typename
}
__typename
}
... on StaticLandingTodayUpSection {
isEnd
totalCount
param {
categoryUid
subcategory {
name
param
__typename
}
page
__typename
}
categoryTabList {
name
param
__typename
}
subcategoryList {
name
param
__typename
}
__typename
}
... on StaticLandingRankingSection {
isEnd
rankingTime
totalCount
param {
categoryUid
subcategory {
name
param
__typename
}
rankingType {
name
param
__typename
}
page
__typename
}
categoryTabList {
name
param
__typename
}
subcategoryList {
name
param
__typename
}
rankingTypeList {
name
param
__typename
}
displayAd {
...DisplayAd
__typename
}
__typename
}
... on StaticLandingGenreSection {
isEnd
totalCount
param {
categoryUid
subcategory {
name
param
__typename
}
sortType {
name
param
__typename
}
page
__typename
}
subcategoryList {
name
param
__typename
}
sortTypeList {
name
param
__typename
}
displayAd {
...DisplayAd
__typename
}
__typename
}
... on StaticLandingFreeSeriesSection {
isEnd
totalCount
param {
categoryUid
tab {
name
param
__typename
}
page
__typename
}
tabList {
name
param
__typename
}
__typename
}
... on StaticLandingEventSection {
isEnd
totalCount
param {
categoryUid
page
__typename
}
categoryTabList {
name
param
__typename
}
__typename
}
... on StaticLandingOriginalSection {
isEnd
totalCount
originalCount
param {
categoryUid
subcategory {
name
param
__typename
}
sortType {
name
param
__typename
}
isComplete
page
__typename
}
subcategoryList {
name
param
__typename
}
sortTypeList {
name
param
__typename
}
recommendItemList {
...Item
__typename
}
__typename
}
groups {
...Group
__typename
}
}
fragment DisplayAd on DisplayAd {
sectionUid
bannerUid
treviUid
momentUid
}
fragment Item on Item {
id
type
...BannerItem
...OnAirItem
...CardViewItem
...CleanViewItem
... on DisplayAdItem {
displayAd {
...DisplayAd
__typename
}
__typename
}
...PosterViewItem
...StrategyViewItem
...RankingListViewItem
...NormalListViewItem
...MoreItem
...EventBannerItem
}
fragment BannerItem on BannerItem {
bannerType
bannerViewType
thumbnail
videoUrl
badgeList
statusBadge
titleImage
title
metaList
caption
scheme
seriesId
eventLog {
...EventLogFragment
__typename
}
moreButton {
...MoreButtonFragment
__typename
}
}
fragment EventLogFragment on EventLog {
fromGraphql
click {
layer1
layer2
setnum
ordnum
copy
imp_id
imp_provider
__typename
}
eventMeta {
id
name
subcategory
category
series
provider
series_id
type
__typename
}
viewimp_contents {
type
name
id
imp_area_ordnum
imp_id
imp_provider
imp_type
layer1
layer2
__typename
}
customProps {
landing_path
view_type
toros_imp_id
toros_file_hash_key
toros_event_meta_id
content_cnt
event_series_id
event_ticket_type
play_url
banner_uid
__typename
}
}
fragment MoreButtonFragment on MoreButton {
type
scheme
title
}
fragment OnAirItem on OnAirItem {
thumbnail
videoUrl
titleImage
title
subtitleList
caption
scheme
}
fragment CardViewItem on CardViewItem {
title
thumbnail
titleImage
scheme
badgeList
ageGradeBadge
statusBadge
ageGrade
selfCensorship
torosImgId
torosFileHashKey
subtitleList
caption
eventLog {
...EventLogFragment
__typename
}
}
fragment CleanViewItem on CleanViewItem {
id
type
showPlayerIcon
scheme
title
thumbnail
badgeList
ageGradeBadge
statusBadge
subtitleList
rank
torosFileHashKey
torosImgId
ageGrade
selfCensorship
eventLog {
...EventLogFragment
__typename
}
}
fragment PosterViewItem on PosterViewItem {
id
type
showPlayerIcon
scheme
title
thumbnail
badgeList
ageGradeBadge
statusBadge
subtitleList
rank
torosFileHashKey
torosImgId
ageGrade
selfCensorship
eventLog {
...EventLogFragment
__typename
}
seriesId
}
fragment StrategyViewItem on StrategyViewItem {
id
title
count
scheme
}
fragment RankingListViewItem on RankingListViewItem {
title
thumbnail
badgeList
ageGradeBadge
statusBadge
ageGrade
selfCensorship
metaList
descriptionList
scheme
torosImgId
torosFileHashKey
rank
eventLog {
...EventLogFragment
__typename
}
}
fragment NormalListViewItem on NormalListViewItem {
id
type
ticketUid
thumbnail
badgeList
ageGradeBadge
statusBadge
ageGrade
isAlaramOn
row1
row2
row3 {
id
metaList
__typename
}
row4
row5
scheme
continueScheme
nextProductScheme
continueData {
...ContinueInfoFragment
__typename
}
torosImpId
torosFileHashKey
seriesId
isCheckMode
isChecked
isReceived
showPlayerIcon
rank
isSingle
singleSlideType
ageGrade
selfCensorship
eventLog {
...EventLogFragment
__typename
}
giftEventLog {
...EventLogFragment
__typename
}
}
fragment ContinueInfoFragment on ContinueInfo {
title
isFree
productId
lastReadProductId
scheme
continueProductType
hasNewSingle
hasUnreadSingle
}
fragment MoreItem on MoreItem {
id
scheme
title
}
fragment EventBannerItem on EventBannerItem {
bannerType
thumbnail
videoUrl
titleImage
title
subtitleList
caption
scheme
eventLog {
...EventLogFragment
__typename
}
}
fragment Group on Group {
id
... on ListViewGroup {
meta {
title
count
__typename
}
__typename
}
type
dataKey
groups {
...GroupInGroup
__typename
}
items {
...Item
__typename
}
}
fragment GroupInGroup on Group {
id
type
dataKey
items {
...Item
__typename
}
... on ListViewGroup {
meta {
title
count
__typename
}
__typename
}
}
'''
# Inclusive range of listing pages fetched in this run. The start page is also
# embedded in the output filename, so runs over different ranges do not
# overwrite each other's workbook.
start_page = 591
last_page = 600

# Upper bound (seconds) on how long a single request may block. Without a
# timeout, requests waits indefinitely on a stalled connection.
REQUEST_TIMEOUT_SEC = 30

wb = openpyxl.Workbook()
ws = wb.active
ws.append([
    'series_id', 'series_title', 'category', 'subs_cnt', 'age_limit'
])

# Pages whose request or parsing failed; printed at the end so they can be
# re-run manually.
error_page = []
try:
    for page in tqdm(range(start_page, last_page + 1)):
        # Reset per page so the error report below can never hit an unbound
        # name (failure on the very first request) or show the previous
        # page's objects.
        response = None
        data = None
        try:
            time.sleep(1)  # throttle: at most one request per second
            variables = {
                'param': {
                    'categoryUid': 10,
                    'page': page,
                    'sortType': 'update',
                    'subcategoryUid': '0'
                },
                'sectionId': 'static-landing-Genre-section-Layout-10-0-update'
            }
            response = requests.post(
                url,
                json={
                    'query': query,
                    'variables': variables,
                },
                timeout=REQUEST_TIMEOUT_SEC,
            )
            data = response.json()
            contents = data['data']['staticLandingGenreSection']['groups'][0]['items']

            # Build every row of the page before touching the worksheet, so a
            # page that fails half-way is not partially written AND listed in
            # error_page (which would duplicate rows when it is re-run).
            page_rows = []
            for content in contents:
                # An item without subtitles yields an empty cell instead of
                # failing the whole page.
                subtitles = content['subtitleList'] or []
                page_rows.append([
                    content['seriesId'],
                    content['title'],
                    content['eventLog']['eventMeta']['subcategory'],
                    subtitles[0] if subtitles else None,
                    content['ageGrade'],
                ])
        except Exception as exc:
            # `Exception`, not a bare except: Ctrl-C / SystemExit must still
            # be able to stop the crawl.
            print(page, response, data, repr(exc))
            error_page.append(page)
        else:
            for row in page_rows:
                ws.append(row)
finally:
    # Runs even when the crawl is interrupted, so rows collected so far are
    # not lost.
    wb.save('카카오페이지_전체작품_{}.xlsx'.format(start_page))
    print(error_page)
## Completed series
# GraphQL endpoint that the page loop further down POSTs to.
url = 'https://page.kakao.com/graphql'
# GraphQL document for the `SearchKeyword` operation. It takes a single
# `$input` variable and selects the result `list` (as NormalListViewItem)
# together with paging and sort/category option fields.
# The page loop below sends this string verbatim as the `query` field of the
# request body and reads `searchKeyword.list` from the response, so the text
# is runtime data and must not be edited casually.
query = '''
query SearchKeyword($input: SearchKeywordInput!) {
searchKeyword(searchKeywordInput: $input) {
id
list {
...NormalListViewItem
}
total
isEnd
keyword
sortOptionList {
...SortOption
}
selectedSortOption {
...SortOption
}
categoryOptionList {
...SortOption
}
selectedCategoryOption {
...SortOption
}
showOnlyComplete
page
}
}
fragment NormalListViewItem on NormalListViewItem {
id
type
ticketUid
thumbnail
badgeList
ageGradeBadge
statusBadge
ageGrade
isAlaramOn
row1
row2
row3 {
id
metaList
}
row4
row5
scheme
continueScheme
nextProductScheme
continueData {
...ContinueInfoFragment
}
torosImpId
torosFileHashKey
seriesId
isCheckMode
isChecked
isReceived
showPlayerIcon
rank
isSingle
singleSlideType
ageGrade
selfCensorship
eventLog {
...EventLogFragment
}
giftEventLog {
...EventLogFragment
}
}
fragment ContinueInfoFragment on ContinueInfo {
title
isFree
productId
lastReadProductId
scheme
continueProductType
hasNewSingle
hasUnreadSingle
}
fragment EventLogFragment on EventLog {
fromGraphql
click {
layer1
layer2
setnum
ordnum
copy
imp_id
imp_provider
}
eventMeta {
id
name
subcategory
category
series
provider
series_id
type
}
viewimp_contents {
type
name
id
imp_area_ordnum
imp_id
imp_provider
imp_type
layer1
layer2
}
customProps {
landing_path
view_type
toros_imp_id
toros_file_hash_key
toros_event_meta_id
content_cnt
event_series_id
event_ticket_type
play_url
banner_uid
}
}
fragment SortOption on SortOption {
id
name
param
}
'''
# Inclusive range of search-result pages fetched in this run. The start page
# is also embedded in the output filename.
start_page = 1
last_page = 1

# Upper bound (seconds) on how long a single request may block. Without a
# timeout, requests waits indefinitely on a stalled connection.
REQUEST_TIMEOUT_SEC = 30

wb = openpyxl.Workbook()
ws = wb.active
# NOTE: the first header is spelled 'serise_id' in the original output; it is
# kept byte-identical so existing consumers of the workbook keep working.
ws.append([
    'serise_id', 'series_title', 'genre', 'author', 'age_limit', 'views', 'last_update_date', 'status'
])

# Pages whose request or parsing failed; printed at the end so they can be
# re-run manually.
error_page = []
try:
    for page in tqdm(range(start_page, last_page + 1)):
        # Reset per page so the error report below can never hit an unbound
        # name (failure on the very first request) or show the previous
        # page's response.
        response = None
        try:
            time.sleep(1)  # throttle: at most one request per second
            variables = {
                'input': {
                    'categoryUid': '10',
                    'page': page,
                    'sortType': 'Latest',
                    'keyword': '.',
                    # NOTE(review): the output file is named "완결작"
                    # (completed works) but this flag is False - confirm
                    # whether only completed series were intended.
                    'showOnlyComplete': False
                }
            }
            response = requests.post(
                url,
                json={
                    'query': query,
                    'variables': variables,
                },
                timeout=REQUEST_TIMEOUT_SEC,
            )
            data = response.json()
            contents = data['data']['searchKeyword']['list']

            # Build every row of the page before touching the worksheet, so a
            # page that fails half-way is not partially written AND listed in
            # error_page (which would duplicate rows when it is re-run).
            page_rows = []
            for content in contents:
                series_id = content['eventLog']['eventMeta']['series_id']
                series_title = content['row1']
                # Strict unpacking: row2 must hold exactly two entries and
                # metaList exactly three; anything else raises ValueError and
                # the page is reported in error_page.
                genre, author = content['row2']
                age_limit = content['ageGrade']
                views, last_update_date, status = content['row3']['metaList']
                page_rows.append([
                    series_id, series_title, genre, author, age_limit,
                    views, last_update_date, status
                ])
        except Exception as exc:
            # `Exception`, not a bare except: Ctrl-C / SystemExit must still
            # be able to stop the crawl.
            print(page, response, repr(exc))
            error_page.append(page)
        else:
            for row in page_rows:
                ws.append(row)
finally:
    # Runs even when the crawl is interrupted, so rows collected so far are
    # not lost.
    wb.save('카카오페이지_완결작_{}.xlsx'.format(start_page))
    print(error_page)
반응형
'파이썬' 카테고리의 다른 글
python 웹툰 크롤링 시리즈2: 카카오웹툰 (1) | 2023.03.18 |
---|---|
python 웹툰 크롤링 시리즈1: 네이버웹툰 (3) | 2023.03.17 |
tqdm 사용법 파이썬 (0) | 2022.06.15 |
jupyter lab에서 ipython-sql로 DB 접속하기(SQL Server, MySQL, postgreSQL) (0) | 2021.08.25 |
파이썬 기초 통계분석 (0) | 2020.10.20 |