/ LECTURE

Python - Scraping

이 페이지는 다음에 대한 공부 기록입니다

JAVA(자바), Python(파이썬) 기반의

AI 활용 응용 소프트웨어 개발자 양성 과정

2021.11.10. ~ 2022.05.18.

찾으시는 정보가 있으시다면
주제별 reference를 이용하시거나
우측 상단에 있는 검색기능을 이용해주세요

88일차 수업

데이터 스크래핑 후 시각화

주제 자유 : 당근마켓 아이폰 중고 시세

main.py

import import_ipynb
import carrot_func as func
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib setup for the charts: keep the minus sign renderable and select
# the font used for the Korean labels.
plt.rcParams['axes.unicode_minus'] = False
plt.rc('font', family='AppleGothic')

# Search terms are '아이폰6' through '아이폰13' (the range end is exclusive).
KEYWORD = '아이폰'
keyword_list = [KEYWORD + str(search) for search in range(6, 14)]

driver = func.set_chrome_driver()
try:
    driver.get('https://www.daangn.com/')
    # results: priced listings, frees: giveaway listings (see carrot_func.carrot).
    results, frees = func.carrot(driver, keyword_list)
finally:
    # Always shut the browser down, even when scraping fails part-way, so no
    # Chrome/chromedriver process is left running.
    driver.quit()

carrot_func.py

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm.notebook import tqdm

def set_chrome_driver():
    """Create a Chrome WebDriver with default options.

    The chromedriver executable path comes from
    ``ChromeDriverManager().install()``.

    Returns:
        The newly created ``webdriver.Chrome`` instance.
    """
    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    return webdriver.Chrome(service=service, options=options)

def search_keyword(driver, keyword):
    """Run a search for *keyword* through the page's header search box.

    The element named ``header-search-input`` is cleared, the keyword is
    typed into it, and the search is submitted with the Enter key.

    Args:
        driver: WebDriver whose current page contains the search box.
        keyword: Text to search for.
    """
    box = driver.find_element(by=By.NAME, value='header-search-input')
    box.clear()
    # Two separate send_keys calls: first the text, then Enter to submit.
    for keys in (keyword, Keys.ENTER):
        box.send_keys(keys)

def click_btn(driver, page):
    """Click the "more" button up to *page* times to load further results.

    Each iteration waits (at most 2 seconds) for elements with class
    ``more-text`` to be present, then clicks the element with class
    ``more-btn``. When that fails, the browser history is nudged instead:
    the first failure after a successful click navigates back, and later
    consecutive failures navigate forward.

    Args:
        driver: WebDriver currently showing a search-result page.
        page: Number of click attempts to make.
    """
    cnt = 0  # 0 after a successful click; 1 once a failure has triggered back()
    for _ in tqdm(range(1, page + 1)):
        try:
            WebDriverWait(driver, 2).until(
                expected_conditions.presence_of_all_elements_located((By.CLASS_NAME, 'more-text')))
            more_btn = driver.find_element(by=By.CLASS_NAME, value='more-btn')
            more_btn.click()
            cnt = 0
        except Exception:
            # Best-effort recovery from timeouts / missing or unclickable
            # buttons. `Exception` (rather than a bare except) lets
            # KeyboardInterrupt and SystemExit stop a long-running scrape.
            if cnt % 2 == 0:
                driver.back()
                cnt += 1
            else:
                driver.forward()

def select_data(container, keyword, results, frees):
    """Extract location and price from listing elements into the output lists.

    For every element in *container* the title, region and price texts are
    read from its ``article-info`` child. Listings are then sorted as follows:

    * titles containing '케이스' are skipped;
    * a price text of '나눔' is appended to *frees*;
    * a numeric price strictly between 40,000 and 3,000,000 is appended to
      *results*;
    * anything else (non-numeric price, price outside the range) is skipped.

    Appended records use *keyword* (not the listing's own title) as 'title'.

    Args:
        container: Iterable of listing WebElements.
        keyword: Search term the listings were found with.
        results: List that receives priced listings (mutated in place).
        frees: List that receives '나눔' listings (mutated in place).
    """
    for product in tqdm(container):
        info = product.find_element(by=By.CLASS_NAME, value='article-info')
        title = info.find_element(by=By.CLASS_NAME, value='article-title').text.strip()
        # Only the first whitespace-separated token of the region is kept.
        location = info.find_element(by=By.CLASS_NAME, value='article-region-name').text.strip().split()[0]
        # Drop the currency suffix and thousands separators: '350,000원' -> '350000'.
        price = info.find_element(by=By.CLASS_NAME, value='article-price').text.strip().replace('원', '').replace(',', '')

        if '케이스' in title:
            continue

        if price == '나눔':
            frees.append({'title': keyword, 'location': location, 'price': price})
            continue

        try:
            price = int(price)
        except ValueError:
            # Price text is not a plain number; ignore this listing.
            continue

        # Presumably an outlier filter for implausible prices; bounds are exclusive.
        if 40000 < price < 3000000:
            results.append({'title': keyword, 'location': location, 'price': price})

def carrot(driver, keyword_list):
    """Scrape listings for every keyword and collect them into two lists.

    For each keyword: run the search, wait up to 2 seconds for the
    ``more-btn`` element, read its ``data-total-pages`` attribute as the
    number of "more" clicks, expand the result list, and hand all
    ``flea-market-article`` elements to ``select_data``.

    Args:
        driver: WebDriver positioned on a page with the header search box.
        keyword_list: Search terms to process in order.

    Returns:
        A ``(results, frees)`` tuple of lists filled by ``select_data``.
    """
    results, frees = [], []
    more_btn_locator = (By.CLASS_NAME, 'more-btn')

    for keyword in tqdm(keyword_list):
        search_keyword(driver, keyword)

        # The wait raises if the button does not appear within the timeout.
        waiter = WebDriverWait(driver, 2)
        more_btn = waiter.until(
            expected_conditions.presence_of_element_located(more_btn_locator))
        total_pages = int(more_btn.get_attribute('data-total-pages'))

        click_btn(driver, total_pages)

        articles = driver.find_elements(by=By.CLASS_NAME, value='flea-market-article')
        select_data(articles, keyword, results, frees)

    return results, frees

Save Excel

# Naming the source columns in the constructor keeps each frame three columns
# wide even when the list is empty; otherwise the rename below would fail with
# a length-mismatch ValueError (an empty frame has no columns).
# The names match the keys of the scraped record dicts.
data = pd.DataFrame(results, columns=['title', 'location', 'price'])
data.columns = ['상품명', '지역', '가격']
data.to_excel(f'carrot_{KEYWORD}.xlsx', index=False)

# Giveaway listings go to a separate workbook.
data = pd.DataFrame(frees, columns=['title', 'location', 'price'])
data.columns = ['상품명', '지역', '가격']
data.to_excel(f'carrot_{KEYWORD}_free.xlsx', index=False)

시각화

# Reload the saved priced listings and draw one price box per model.
data = pd.read_excel('carrot_아이폰.xlsx')

price_ax = plt.figure(figsize=(12, 8)).gca()
price_ax.set_title('기종별 시세')
sns.boxplot(data=data, x='상품명', y='가격', ax=price_ax)

chart

# Bar chart of how many listings were collected for each model.
count_ax = plt.figure(figsize=(12, 9)).gca()
count_ax.set_title('매물 등록 수')
sns.countplot(data=data, x='상품명', ax=count_ax)

chart

# NOTE(review): `iris` is not used by anything below; it looks like a leftover
# from the seaborn gallery example this cell is based on.
iris = pd.melt(data, '상품명', var_name='지역')
f, ax = plt.subplots(figsize=(12, 20))
sns.despine(bottom=True, left=True)

# Individual listings: one faint, dodged point per row.
sns.stripplot(x='가격', y='지역', hue='상품명',
              data=data, dodge=True, alpha=.25, zorder=1)

# Per-region mean price for each model, drawn on top without connecting lines.
sns.pointplot(x='가격', y='지역', hue='상품명',
              data=data, dodge=.8 - .8 / 3,
              join=False, palette='dark',
              markers='d', scale=.75, ci=None)

plt.rcParams['axes.unicode_minus'] = False
plt.rc('font', family='AppleGothic')

# Both plots register one legend entry per model. Keep only the second set
# (the point plot) and slice handles and labels at the SAME offset so that
# every marker stays paired with its own model name.
handles, labels = ax.get_legend_handles_labels()
n_models = data['상품명'].nunique()
ax.legend(handles[n_models:], labels[n_models:], title='상품명',
          handletextpad=0, columnspacing=1,
          loc='lower right', ncol=3, frameon=True)

chart