Python - Scraping
이 페이지는 다음에 대한 공부 기록입니다
JAVA(자바), Python(파이썬) 기반의
AI 활용 응용 소프트웨어 개발자 양성 과정
2021.11.10. ~ 2022.05.18.
찾으시는 정보가 있으시다면
주제별 reference를 이용하시거나
우측 상단에 있는 검색기능을 이용해주세요
88일차 수업
데이터 스크래핑 후 시각화
주제 자유 : 당근마켓 아이폰 중고 시세
main.py
import import_ipynb
import carrot_func as func
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Matplotlib setup: render minus signs with the plain ASCII hyphen and use the
# AppleGothic font family so that Korean labels can be drawn.
plt.rcParams['axes.unicode_minus'] = False
plt.rc('font', family='AppleGothic')

# Search terms are the base keyword followed by 6..13 ('아이폰6' ... '아이폰13').
KEYWORD = '아이폰'
keyword_list = [KEYWORD + str(search) for search in range(6, 14)]

# Scrape every keyword; `results` holds priced listings and `frees` holds
# giveaway listings (see func.carrot).
driver = func.set_chrome_driver()
try:
    driver.get('https://www.daangn.com/')
    results, frees = func.carrot(driver, keyword_list)
finally:
    # Always shut the browser down, even when scraping fails, so the Chrome
    # and chromedriver processes are not left running.
    driver.quit()
carrot_func.py
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm.notebook import tqdm
def set_chrome_driver():
    """Start a Chrome browser through Selenium and return its WebDriver.

    Chrome is launched with default, unmodified options. The chromedriver
    executable path comes from ``ChromeDriverManager().install()``.
    """
    options = webdriver.ChromeOptions()
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)
def search_keyword(driver, keyword):
    """Submit *keyword* through the page's header search box.

    The input named ``header-search-input`` is cleared, filled with
    *keyword*, and then sent ENTER to run the search.
    """
    box = driver.find_element(by=By.NAME, value='header-search-input')
    box.clear()
    # Two separate send_keys calls: first the text, then the ENTER key.
    for keys in (keyword, Keys.ENTER):
        box.send_keys(keys)
def click_btn(driver, page):
    """Click the "more" button up to *page* times to load further listings.

    Each iteration waits up to 2 seconds for elements with class
    ``more-text`` and then clicks the element with class ``more-btn``.

    When an iteration fails, the browser is navigated instead of clicking:
    the first failure after a success goes back one history entry, and every
    further consecutive failure goes forward. A successful click resets this.
    NOTE(review): the back/forward recovery presumably forces the result page
    to reload; the intent is not documented -- confirm.

    Args:
        driver: Selenium WebDriver showing a search-result page.
        page: Number of iterations to run (the caller passes the button's
            ``data-total-pages`` value).
    """
    cnt = 0  # 0 -> next failure goes back; 1 -> next failure goes forward
    for _ in tqdm(range(1, page + 1)):
        try:
            WebDriverWait(driver, 2).until(
                expected_conditions.presence_of_all_elements_located(
                    (By.CLASS_NAME, 'more-text')))
            more_btn = driver.find_element(by=By.CLASS_NAME, value='more-btn')
            more_btn.click()
            cnt = 0
        except Exception:
            # Deliberately broad best-effort recovery, but no longer a bare
            # ``except`` so KeyboardInterrupt/SystemExit can stop the scrape.
            if cnt % 2 == 0:
                driver.back()
                cnt += 1
            else:
                driver.forward()
def select_data(container, keyword, results, frees):
    """Extract region and price from each listing and sort it into a bucket.

    For every element in *container* the title, the first word of the region
    name, and the price text (with '원' and thousands separators removed) are
    read from its ``article-info`` child. Then:

    * listings whose title contains '케이스' (case) are skipped;
    * listings priced '나눔' (giveaway) are appended to *frees*;
    * listings whose price is not a plain integer are skipped;
    * listings with 40000 < price < 3000000 are appended to *results*.

    Each appended record is ``{'title', 'location', 'price'}``. Note that
    ``'title'`` stores the search *keyword*, not the listing's own title.

    Args:
        container: Iterable of listing elements.
        keyword: Search term that produced these listings.
        results: List mutated in place with priced listings.
        frees: List mutated in place with giveaway listings.
    """
    for product in tqdm(container):
        info = product.find_element(by=By.CLASS_NAME, value='article-info')
        title = info.find_element(
            by=By.CLASS_NAME, value='article-title').text.strip()
        location = info.find_element(
            by=By.CLASS_NAME, value='article-region-name').text.strip().split()[0]
        price = info.find_element(
            by=By.CLASS_NAME, value='article-price'
        ).text.strip().replace('원', '').replace(',', '')

        # Accessories (phone cases) would distort the phone price statistics.
        if '케이스' in title:
            continue

        if price == '나눔':
            frees.append({'title': keyword, 'location': location, 'price': price})
            continue

        try:
            price = int(price)
        except ValueError:
            # Price text that is not a plain number is ignored.
            continue

        # Keep only prices strictly inside the accepted window.
        if 40000 < price < 3000000:
            results.append({'title': keyword, 'location': location, 'price': price})
def carrot(driver, keyword_list):
    """Scrape listings for every keyword and return ``(results, frees)``.

    For each keyword the search is submitted, the ``more-btn`` element's
    ``data-total-pages`` attribute is read (waiting up to 2 seconds for the
    button), the button is clicked that many times via ``click_btn``, and all
    ``flea-market-article`` elements are handed to ``select_data``.

    Args:
        driver: Selenium WebDriver already on the site.
        keyword_list: Iterable of search terms.

    Returns:
        Tuple of two lists: priced listings and giveaway listings, each
        accumulated across all keywords.
    """
    results, frees = [], []
    for keyword in tqdm(keyword_list):
        search_keyword(driver, keyword)

        button = WebDriverWait(driver, 2).until(
            expected_conditions.presence_of_element_located(
                (By.CLASS_NAME, 'more-btn')))
        total_pages = int(button.get_attribute('data-total-pages'))
        click_btn(driver, total_pages)

        select_data(
            driver.find_elements(by=By.CLASS_NAME, value='flea-market-article'),
            keyword, results, frees)
    return results, frees
Save Excel
# Export the priced listings and the giveaway listings to separate workbooks,
# with Korean column headers (product name, region, price).
for rows, xlsx_name in (
    (results, f'carrot_{KEYWORD}.xlsx'),
    (frees, f'carrot_{KEYWORD}_free.xlsx'),
):
    # Naming the columns in the constructor keeps the frame three columns wide
    # even when `rows` is empty; assigning three headers to a zero-column
    # frame would raise ValueError (length mismatch).
    data = pd.DataFrame(rows, columns=['title', 'location', 'price'])
    data.columns = ['상품명', '지역', '가격']
    data.to_excel(xlsx_name, index=False)
시각화
# Load the priced listings exported earlier.
data = pd.read_excel('carrot_아이폰.xlsx')

# Box plot: price distribution per product name.
_, price_ax = plt.subplots(figsize=(12, 8))
price_ax.set_title('기종별 시세')
sns.boxplot(data=data, x='상품명', y='가격', ax=price_ax)

# Count plot: number of listings per product name.
_, count_ax = plt.subplots(figsize=(12, 9))
count_ax.set_title('매물 등록 수')
sns.countplot(data=data, x='상품명', ax=count_ax)
# Price by region, coloured by product name: a faint strip plot of every
# listing with a point plot drawn on top of it.
f, ax = plt.subplots(figsize=(12, 20))
sns.despine(bottom=True, left=True)

sns.stripplot(x='가격', y='지역', hue='상품명',
              data=data, dodge=True, alpha=.25, zorder=1)

# The dodge value lines the points up with the strips of the same hue.
sns.pointplot(x='가격', y='지역', hue='상품명',
              data=data, dodge=.8 - .8 / 3,
              join=False, palette='dark',
              markers='d', scale=.75, ci=None)

plt.rcParams['axes.unicode_minus'] = False
plt.rc('font', family='AppleGothic')

# Both plots register one legend entry per product name, so the first
# `n_models` entries come from the strip plot. Drop them from handles AND
# labels at the same offset so each remaining label keeps its own handle.
n_models = data['상품명'].nunique()
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[n_models:], labels[n_models:], title='상품명',
          handletextpad=0, columnspacing=1,
          loc='lower right', ncol=3, frameon=True)