# Python / Google Colab — corona crawling & scraping: confirmed cases by province
# https://github.com/gyunseul9/coronagathering
import os
import re
import requests
import urllib.request
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
def overlap_param(csv_udate, udate):
    """Return 1 when the stored timestamp equals the scraped one, else 0.

    Prints which branch was taken so the run log shows whether the new
    record duplicates the last CSV row.
    """
    is_duplicate = csv_udate == udate
    print('Overlap Record' if is_duplicate else 'Add Record')
    return 1 if is_duplicate else 0
def make_date(string):
    """Build a YYYYMMDDHH-style timestamp string from a scraped date label.

    The page shows something like '(3.15. 18시 기준)'; the first three
    numbers found are taken as month, day and hour, and the current year
    is prepended (the page itself does not state the year).

    Raises IndexError if *string* contains fewer than three numbers.
    """
    # r'' raw string: '\d' in a plain literal is an invalid escape in
    # modern Python.  (The original also called datetime.today() once and
    # discarded the result — dead statement removed.)
    numbers = re.findall(r'\d+', string)
    year = datetime.today().strftime('%Y')
    month = numbers[0]
    day = numbers[1]
    hour = numbers[2]
    return year + month + day + hour
def remove_keyword(string):
    """Strip the '(' and ')' characters from *string* and return the result."""
    return string.replace('(', '').replace(')', '')
def write_csv(df):
    """Append *df* to the cumulative status CSV unless it is a duplicate.

    If the CSV already exists, the update timestamp of its last row is
    compared (via overlap_param) with df's 'udate' column; the frame is
    appended only when they differ.  A missing CSV is created by the
    append write.

    NOTE(review): relies on a module-level CSV_URI path prefix that is not
    defined in this view — presumably set elsewhere (e.g. a Colab cell);
    confirm before running standalone.
    """
    savename = CSV_URI + 'coronastatus.csv'
    basename = os.path.basename(savename)  # replaces manual split('/')[-1]
    if os.path.exists(savename):
        print('Exist CSV', basename)
        # The file is written without a header, so read it the same way and
        # take column 0 (udate) of the last row for the duplicate check.
        df_read = pd.read_csv(savename, header=None)
        last_udate = df_read.tail(1).iloc[:, 0]
        result = overlap_param(int(last_udate.values[0]), int(df['udate'].values[0]))
    else:
        print('Does not exist CSV', basename)
        result = 0
    if result == 0:
        # mode='a' keeps the file cumulative; utf-8-sig so Excel renders Korean.
        df.to_csv(savename, header=False, index=False, mode='a', encoding='utf-8-sig')
def scrappy(soup):
    """Scrape the per-region live board and hand one DataFrame to write_csv.

    Builds 16 rows (one per region button on the map) with columns
    udate/area/num/before; on any failed lookup the column's placeholder
    name is stored instead so the frame stays rectangular.
    """
    container_sel = 'div.wrap.nj div.mainlive_container div.container div div.liveboard_layout div.live_right.main_box_toggle'
    button_sel = 'div.regional_patient_status_A div.rpsa_map div.rpsam_graph div#main_maplayout button'

    # Hoisted out of the loop: these selects are loop-invariant (the
    # original re-ran the container select 16x and the button select 3x
    # per iteration).
    contents = soup.select(container_sel)

    # BUG FIX: failed lookups raise IndexError (empty select result), not
    # ValueError, so the original fallback branches could never fire and a
    # layout change crashed the run.  Catch what actually gets raised.
    try:
        stamp = make_date(contents[0].select('h2 a span.livedate')[0].text.strip())
    except (IndexError, ValueError):
        stamp = 'udate'
    try:
        buttons = contents[0].select(button_sel)
    except IndexError:
        buttons = []

    udate, area, num, before = [], [], [], []
    for i in range(16):
        udate.append(stamp)
        try:
            area.append(buttons[i].select('span.name')[0].text.strip())
        except (IndexError, ValueError):
            area.append('area')
        try:
            num.append(buttons[i].select('span.num')[0].text.strip())
        except (IndexError, ValueError):
            num.append('num')
        try:
            before.append(remove_keyword(buttons[i].select('span.before')[0].text.strip()))
        except (IndexError, ValueError):
            before.append('before')

    dic_corona = {
        'udate': udate,
        'area': area,
        'num': num,
        'before': before,
    }
    df_corona = pd.DataFrame(dic_corona)
    write_csv(df_corona)
URL = 'http://ncov.mohw.go.kr/'


def main():
    """Fetch the MOHW corona status page and scrape it into the CSV."""
    # timeout so a dead/slow server cannot hang the job forever
    resp = requests.get(URL, timeout=30)
    soup = BeautifulSoup(resp.text, 'lxml')
    scrappy(soup)


if __name__ == '__main__':
    # Guarded entry point: importing this module no longer triggers a
    # network request as a side effect.
    main()
# 댓글 ("Comments" — blog-page residue from the original paste, not code)