파이썬

python 웹툰 크롤링 시리즈3: 카카오페이지

박범준2 2023. 3. 19. 08:00
반응형

웹툰 크롤링 시리즈의 마지막 편은 카카오페이지이다.

카카페의 전체 작품의 시리즈ID, 작품명, 장르, 작가, 연령가, 조회수, 마지막연재일, 연재상태 정보를 가져오는 코드이다

카카페는 GraphQL을 통해 데이터를 요청해야 해서 코드가 길다.

 

 

import openpyxl
import requests
from tqdm import tqdm
import time


## All works
# GraphQL endpoint that every request in this section is POSTed to.
url = 'https://page.kakao.com/graphql'
# GraphQL document for the `staticLandingGenreSection` operation. It takes two
# variables, `$sectionId` and `$param`, and selects the result through the
# `Section` fragment, which in turn pulls in the `Group` / `Item` fragments
# defined below it.
# The crawl loop further down only reads `groups[0].items[*]` and, per item,
# `seriesId`, `title`, `eventLog.eventMeta.subcategory`, `subtitleList` and
# `ageGrade`; every other selected field is ignored by this script.
# NOTE(review): the document looks like it was copied verbatim from the site's
# own client — confirm before trimming any fragment.
query = '''query staticLandingGenreSection($sectionId: ID!, $param: StaticLandingGenreParamInput!) {
  staticLandingGenreSection(sectionId: $sectionId, param: $param) {
    ...Section
    __typename
  }
}

fragment Section on Section {
  id
  uid
  type
  title
  ... on DependOnLoggedInSection {
    loggedInTitle
    loggedInScheme
    __typename
  }
  ... on SchemeSection {
    scheme
    __typename
  }
  ... on MetaInfoTypeSection {
    metaInfoType
    __typename
  }
  ... on TabSection {
    sectionMainTabList {
      uid
      title
      isSelected
      scheme
      additionalString
      subTabList {
        uid
        title
        isSelected
        groupId
        __typename
      }
      __typename
    }
    __typename
  }
  ... on ThemeKeywordSection {
    themeKeywordList {
      uid
      title
      scheme
      __typename
    }
    __typename
  }
  ... on StaticLandingDayOfWeekSection {
    isEnd
    totalCount
    displayAd {
      sectionUid
      bannerUid
      treviUid
      momentUid
      __typename
    }
    param {
      categoryUid
      businessModel {
        name
        param
        __typename
      }
      subcategory {
        name
        param
        __typename
      }
      dayTab {
        name
        param
        __typename
      }
      page
      size
      __typename
    }
    businessModelList {
      name
      param
      __typename
    }
    subcategoryList {
      name
      param
      __typename
    }
    dayTabList {
      name
      param
      __typename
    }
    __typename
  }
  ... on StaticLandingTodayNewSection {
    totalCount
    param {
      categoryUid
      subcategory {
        name
        param
        __typename
      }
      __typename
    }
    categoryTabList {
      name
      param
      __typename
    }
    subcategoryList {
      name
      param
      __typename
    }
    __typename
  }
  ... on StaticLandingTodayUpSection {
    isEnd
    totalCount
    param {
      categoryUid
      subcategory {
        name
        param
        __typename
      }
      page
      __typename
    }
    categoryTabList {
      name
      param
      __typename
    }
    subcategoryList {
      name
      param
      __typename
    }
    __typename
  }
  ... on StaticLandingRankingSection {
    isEnd
    rankingTime
    totalCount
    param {
      categoryUid
      subcategory {
        name
        param
        __typename
      }
      rankingType {
        name
        param
        __typename
      }
      page
      __typename
    }
    categoryTabList {
      name
      param
      __typename
    }
    subcategoryList {
      name
      param
      __typename
    }
    rankingTypeList {
      name
      param
      __typename
    }
    displayAd {
      ...DisplayAd
      __typename
    }
    __typename
  }
  ... on StaticLandingGenreSection {
    isEnd
    totalCount
    param {
      categoryUid
      subcategory {
        name
        param
        __typename
      }
      sortType {
        name
        param
        __typename
      }
      page
      __typename
    }
    subcategoryList {
      name
      param
      __typename
    }
    sortTypeList {
      name
      param
      __typename
    }
    displayAd {
      ...DisplayAd
      __typename
    }
    __typename
  }
  ... on StaticLandingFreeSeriesSection {
    isEnd
    totalCount
    param {
      categoryUid
      tab {
        name
        param
        __typename
      }
      page
      __typename
    }
    tabList {
      name
      param
      __typename
    }
    __typename
  }
  ... on StaticLandingEventSection {
    isEnd
    totalCount
    param {
      categoryUid
      page
      __typename
    }
    categoryTabList {
      name
      param
      __typename
    }
    __typename
  }
  ... on StaticLandingOriginalSection {
    isEnd
    totalCount
    originalCount
    param {
      categoryUid
      subcategory {
        name
        param
        __typename
      }
      sortType {
        name
        param
        __typename
      }
      isComplete
      page
      __typename
    }
    subcategoryList {
      name
      param
      __typename
    }
    sortTypeList {
      name
      param
      __typename
    }
    recommendItemList {
      ...Item
      __typename
    }
    __typename
  }
  groups {
    ...Group
    __typename
  }
}

fragment DisplayAd on DisplayAd {
  sectionUid
  bannerUid
  treviUid
  momentUid
}

fragment Item on Item {
  id
  type
  ...BannerItem
  ...OnAirItem
  ...CardViewItem
  ...CleanViewItem
  ... on DisplayAdItem {
    displayAd {
      ...DisplayAd
      __typename
    }
    __typename
  }
  ...PosterViewItem
  ...StrategyViewItem
  ...RankingListViewItem
  ...NormalListViewItem
  ...MoreItem
  ...EventBannerItem
}

fragment BannerItem on BannerItem {
  bannerType
  bannerViewType
  thumbnail
  videoUrl
  badgeList
  statusBadge
  titleImage
  title
  metaList
  caption
  scheme
  seriesId
  eventLog {
    ...EventLogFragment
    __typename
  }
  moreButton {
    ...MoreButtonFragment
    __typename
  }
}

fragment EventLogFragment on EventLog {
  fromGraphql
  click {
    layer1
    layer2
    setnum
    ordnum
    copy
    imp_id
    imp_provider
    __typename
  }
  eventMeta {
    id
    name
    subcategory
    category
    series
    provider
    series_id
    type
    __typename
  }
  viewimp_contents {
    type
    name
    id
    imp_area_ordnum
    imp_id
    imp_provider
    imp_type
    layer1
    layer2
    __typename
  }
  customProps {
    landing_path
    view_type
    toros_imp_id
    toros_file_hash_key
    toros_event_meta_id
    content_cnt
    event_series_id
    event_ticket_type
    play_url
    banner_uid
    __typename
  }
}

fragment MoreButtonFragment on MoreButton {
  type
  scheme
  title
}

fragment OnAirItem on OnAirItem {
  thumbnail
  videoUrl
  titleImage
  title
  subtitleList
  caption
  scheme
}

fragment CardViewItem on CardViewItem {
  title
  thumbnail
  titleImage
  scheme
  badgeList
  ageGradeBadge
  statusBadge
  ageGrade
  selfCensorship
  torosImgId
  torosFileHashKey
  subtitleList
  caption
  eventLog {
    ...EventLogFragment
    __typename
  }
}

fragment CleanViewItem on CleanViewItem {
  id
  type
  showPlayerIcon
  scheme
  title
  thumbnail
  badgeList
  ageGradeBadge
  statusBadge
  subtitleList
  rank
  torosFileHashKey
  torosImgId
  ageGrade
  selfCensorship
  eventLog {
    ...EventLogFragment
    __typename
  }
}

fragment PosterViewItem on PosterViewItem {
  id
  type
  showPlayerIcon
  scheme
  title
  thumbnail
  badgeList
  ageGradeBadge
  statusBadge
  subtitleList
  rank
  torosFileHashKey
  torosImgId
  ageGrade
  selfCensorship
  eventLog {
    ...EventLogFragment
    __typename
  }
  seriesId
}

fragment StrategyViewItem on StrategyViewItem {
  id
  title
  count
  scheme
}

fragment RankingListViewItem on RankingListViewItem {
  title
  thumbnail
  badgeList
  ageGradeBadge
  statusBadge
  ageGrade
  selfCensorship
  metaList
  descriptionList
  scheme
  torosImgId
  torosFileHashKey
  rank
  eventLog {
    ...EventLogFragment
    __typename
  }
}

fragment NormalListViewItem on NormalListViewItem {
  id
  type
  ticketUid
  thumbnail
  badgeList
  ageGradeBadge
  statusBadge
  ageGrade
  isAlaramOn
  row1
  row2
  row3 {
    id
    metaList
    __typename
  }
  row4
  row5
  scheme
  continueScheme
  nextProductScheme
  continueData {
    ...ContinueInfoFragment
    __typename
  }
  torosImpId
  torosFileHashKey
  seriesId
  isCheckMode
  isChecked
  isReceived
  showPlayerIcon
  rank
  isSingle
  singleSlideType
  ageGrade
  selfCensorship
  eventLog {
    ...EventLogFragment
    __typename
  }
  giftEventLog {
    ...EventLogFragment
    __typename
  }
}

fragment ContinueInfoFragment on ContinueInfo {
  title
  isFree
  productId
  lastReadProductId
  scheme
  continueProductType
  hasNewSingle
  hasUnreadSingle
}

fragment MoreItem on MoreItem {
  id
  scheme
  title
}

fragment EventBannerItem on EventBannerItem {
  bannerType
  thumbnail
  videoUrl
  titleImage
  title
  subtitleList
  caption
  scheme
  eventLog {
    ...EventLogFragment
    __typename
  }
}

fragment Group on Group {
  id
  ... on ListViewGroup {
    meta {
      title
      count
      __typename
    }
    __typename
  }
  type
  dataKey
  groups {
    ...GroupInGroup
    __typename
  }
  items {
    ...Item
    __typename
  }
}

fragment GroupInGroup on Group {
  id
  type
  dataKey
  items {
    ...Item
    __typename
  }
  ... on ListViewGroup {
    meta {
      title
      count
      __typename
    }
    __typename
  }
}


'''

# Inclusive range of listing pages to crawl in this run. The output file name
# is keyed on start_page, so runs over different ranges write different files.
start_page = 591
last_page = 600

wb = openpyxl.Workbook()
ws = wb.active
ws.append([
    'series_id', 'series_title', 'category', 'subs_cnt', 'age_limit'
])

# Pages whose request or parsing failed; printed at the end so they can be
# crawled again.
error_page = []

try:
    for page in tqdm(range(start_page, last_page + 1)):
        # Reset per page so the error report below never shows the previous
        # page's response, and never hits an unbound name when the very first
        # request fails.
        response = None
        data = None

        time.sleep(1)  # pause between requests

        variables = {
            'param': {
                'categoryUid': 10,
                'page': page,
                'sortType': 'update',
                'subcategoryUid': '0'
            },
            'sectionId': 'static-landing-Genre-section-Layout-10-0-update'
        }

        try:
            response = requests.post(
                url,
                json={
                    'query': query,
                    'variables': variables,
                },
                # Without a timeout a stalled connection would block the
                # whole crawl indefinitely.
                timeout=30,
            )
            data = response.json()
            contents = data['data']['staticLandingGenreSection']['groups'][0]['items']

            # Parse the whole page before touching the sheet: a page that
            # fails halfway must not leave partial rows behind, otherwise
            # re-crawling it from error_page would duplicate them.
            rows = [
                [
                    content['seriesId'],
                    content['title'],
                    content['eventLog']['eventMeta']['subcategory'],
                    content['subtitleList'][0],
                    content['ageGrade'],
                ]
                for content in contents
            ]
        except (requests.RequestException, ValueError, KeyError,
                IndexError, TypeError) as exc:
            # Network failure, non-JSON body, or a payload that does not have
            # the expected shape. KeyboardInterrupt is deliberately NOT caught
            # so the crawl can still be stopped with Ctrl-C.
            print(page, response, data, repr(exc))
            error_page.append(page)
            continue

        for row in rows:
            ws.append(row)
finally:
    # Save whatever was collected even if the crawl is interrupted.
    wb.save(f'카카오페이지_전체작품_{start_page}.xlsx')
    print(error_page)


## Completed works
# GraphQL endpoint that every request in this section is POSTed to.
url = 'https://page.kakao.com/graphql'
# GraphQL document for the `SearchKeyword` operation. It takes a single
# `$input` variable and selects the result list through the
# `NormalListViewItem` fragment, plus paging/sort metadata (`total`, `isEnd`,
# `page`, the `SortOption` lists, `showOnlyComplete`).
# The crawl loop further down only reads, per list item,
# `eventLog.eventMeta.series_id`, `row1`, `row2`, `ageGrade` and
# `row3.metaList`; every other selected field is ignored by this script.
# The leading whitespace before some `fragment` keywords is part of the string
# as it was pasted and is kept as-is.
query = '''
query SearchKeyword($input: SearchKeywordInput!) {
  searchKeyword(searchKeywordInput: $input) {
    id
    list {
      ...NormalListViewItem
    }
    total
    isEnd
    keyword
    sortOptionList {
      ...SortOption
    }
    selectedSortOption {
      ...SortOption
    }
    categoryOptionList {
      ...SortOption
    }
    selectedCategoryOption {
      ...SortOption
    }
    showOnlyComplete
    page
  }
}
    
    fragment NormalListViewItem on NormalListViewItem {
  id
  type
  ticketUid
  thumbnail
  badgeList
  ageGradeBadge
  statusBadge
  ageGrade
  isAlaramOn
  row1
  row2
  row3 {
    id
    metaList
  }
  row4
  row5
  scheme
  continueScheme
  nextProductScheme
  continueData {
    ...ContinueInfoFragment
  }
  torosImpId
  torosFileHashKey
  seriesId
  isCheckMode
  isChecked
  isReceived
  showPlayerIcon
  rank
  isSingle
  singleSlideType
  ageGrade
  selfCensorship
  eventLog {
    ...EventLogFragment
  }
  giftEventLog {
    ...EventLogFragment
  }
}
    
    fragment ContinueInfoFragment on ContinueInfo {
  title
  isFree
  productId
  lastReadProductId
  scheme
  continueProductType
  hasNewSingle
  hasUnreadSingle
}
    

    fragment EventLogFragment on EventLog {
  fromGraphql
  click {
    layer1
    layer2
    setnum
    ordnum
    copy
    imp_id
    imp_provider
  }
  eventMeta {
    id
    name
    subcategory
    category
    series
    provider
    series_id
    type
  }
  viewimp_contents {
    type
    name
    id
    imp_area_ordnum
    imp_id
    imp_provider
    imp_type
    layer1
    layer2
  }
  customProps {
    landing_path
    view_type
    toros_imp_id
    toros_file_hash_key
    toros_event_meta_id
    content_cnt
    event_series_id
    event_ticket_type
    play_url
    banner_uid
  }
}
    

    fragment SortOption on SortOption {
  id
  name
  param
}


'''

# Inclusive range of search-result pages to crawl in this run. The output file
# name is keyed on start_page, so runs over different ranges write different
# files.
start_page = 1
last_page = 1

wb = openpyxl.Workbook()
ws = wb.active
# NOTE: 'serise_id' is a misspelling of 'series_id'; it is kept unchanged so
# files produced by this script keep the same header as earlier runs.
ws.append([
    'serise_id', 'series_title', 'genre', 'author', 'age_limit', 'views', 'last_update_date', 'status'
])

# Pages whose request or parsing failed; printed at the end so they can be
# crawled again.
error_page = []

try:
    for page in tqdm(range(start_page, last_page + 1)):
        # Reset per page so the error report below never shows the previous
        # page's response, and never hits an unbound name when the very first
        # request fails.
        response = None

        time.sleep(1)  # pause between requests

        variables = {
            'input': {
                'categoryUid': '10',
                'page': page,
                'sortType': 'Latest',
                'keyword': '.',
                # NOTE(review): the output file is named "완결작" (completed
                # works) but results are not restricted to completed series
                # here; the per-series status is written to the 'status'
                # column instead — confirm this is intended.
                'showOnlyComplete': False
            }
        }

        try:
            response = requests.post(
                url,
                json={
                    'query': query,
                    'variables': variables,
                },
                # Without a timeout a stalled connection would block the
                # whole crawl indefinitely.
                timeout=30,
            )
            data = response.json()
            contents = data['data']['searchKeyword']['list']

            # Parse the whole page before touching the sheet: a page that
            # fails halfway must not leave partial rows behind, otherwise
            # re-crawling it from error_page would duplicate them.
            rows = []
            for content in contents:
                # Strict unpacking on purpose: row2 must hold exactly two
                # values and metaList exactly three, otherwise the columns
                # would be misaligned; a mismatch raises ValueError and the
                # page is reported as failed.
                genre, author = content['row2']
                views, last_update_date, status = content['row3']['metaList']
                rows.append([
                    content['eventLog']['eventMeta']['series_id'],
                    content['row1'],
                    genre,
                    author,
                    content['ageGrade'],
                    views,
                    last_update_date,
                    status,
                ])
        except (requests.RequestException, ValueError, KeyError,
                IndexError, TypeError) as exc:
            # Network failure, non-JSON body, or a payload that does not have
            # the expected shape. KeyboardInterrupt is deliberately NOT caught
            # so the crawl can still be stopped with Ctrl-C.
            print(page, response, repr(exc))
            error_page.append(page)
            continue

        for row in rows:
            ws.append(row)
finally:
    # Save whatever was collected even if the crawl is interrupted.
    wb.save(f'카카오페이지_완결작_{start_page}.xlsx')
    print(error_page)
반응형