파이썬, 구글 코랩, 코로나 크롤링, 스크래핑, 시도별 확진자 현황

https://github.com/gyunseul9/coronagathering


import os

import re

import requests

import urllib.request

import pandas as pd

from datetime import datetime

from bs4 import BeautifulSoup

from google.colab import drive


# Mount the user's Google Drive into the Colab runtime so the scraped CSV
# persists between sessions. drive.mount prompts for OAuth on first run;
# NOTE(review): it appears to return None, so `c` is presumably unused — confirm.
c=drive.mount('/content/drive')
# Directory (relative to the Colab working dir, inside the mounted drive)
# where coronastatus.csv is written/appended by write_csv().
CSV_URI = 'drive/My Drive/development/test/'


def overlap_param(csv_udate, udate):
    """Return 1 if the newly scraped update timestamp duplicates the last
    stored one, else 0.

    Parameters
    ----------
    csv_udate : int
        Update timestamp (YYYYMMDDHH as int) of the last row already in the CSV.
    udate : int
        Update timestamp of the freshly scraped data.

    Returns
    -------
    int
        1 when the record already exists (skip writing), 0 when it is new.
    """
    if csv_udate == udate:
        print('Overlap Record')
        return 1
    # Timestamps differ: the scraped snapshot is new and should be appended.
    print('Add Record')
    return 0


def make_date(string):
    """Build a YYYYMMDDHH-style timestamp string from the site's date label.

    The site shows something like ``(3.5. 10시 기준)``; the first three
    numbers found are taken as month, day and hour, and the current year
    is prepended (the page itself does not display a year).

    Parameters
    ----------
    string : str
        Raw text of the ``span.livedate`` element.

    Returns
    -------
    str
        Concatenation of current year + month + day + hour, e.g. ``"20203510"``.

    Raises
    ------
    ValueError
        If fewer than three numbers are present in *string*.  ValueError is
        used (not IndexError) so the caller's ``except ValueError`` fallback
        in scrappy() actually triggers on malformed input.
    """
    year = datetime.today().strftime("%Y")

    # Raw string for the regex; extract all digit runs from the label.
    numbers = re.findall(r'\d+', string)
    if len(numbers) < 3:
        raise ValueError('expected at least month, day and hour in: %r' % string)

    month, day, hour = numbers[0], numbers[1], numbers[2]
    return year + month + day + hour


def remove_keyword(string):
    """Return *string* with all parenthesis characters removed.

    Used to turn the site's delta text such as ``"(+5)"`` into ``"+5"``.
    """
    for paren in ('(', ')'):
        string = string.replace(paren, '')
    return string


def write_csv(df):
    """Append the scraped DataFrame to the status CSV, skipping duplicates.

    If the CSV already exists, the first column of its last row (the stored
    update timestamp) is compared against ``df['udate']`` via overlap_param();
    the new rows are appended only when the timestamps differ.

    Parameters
    ----------
    df : pandas.DataFrame
        Scraped data with columns udate/area/num/before; all rows share the
        same ``udate`` value.
    """
    savename = CSV_URI + 'coronastatus.csv'
    # Bare file name, for log messages only.
    basename = savename.split('/')[-1]

    if os.path.exists(savename):
        print('Exist CSV', basename)
        # File was written without a header, so read it headerless too.
        df_read = pd.read_csv(savename, header=None)

        last_row = df_read.tail(1)
        csv_udate = last_row.iloc[:, 0]
        result = overlap_param(int(csv_udate.values[0]), int(df['udate'].values[0]))
    else:
        print('Does not exist CSV', basename)
        result = 0

    if result == 0:
        # Append mode; utf-8-sig keeps Korean text readable in Excel.
        df.to_csv(savename, header=False, index=False, mode='a', encoding='utf-8-sig')


def scrappy(soup):
    """Scrape per-region confirmed-case figures from the MOHW landing page
    and append them to the status CSV via write_csv().

    For each of the 16 regions (buttons on the site's map widget) this
    collects the update timestamp, region name, case count and day-over-day
    delta; any field that cannot be scraped falls back to a placeholder
    string so the row stays aligned.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of http://ncov.mohw.go.kr/.
    """
    # Container of the "live board" panel; loop-invariant, so select it once
    # instead of once per region as the original did.
    contents = soup.select('div.wrap.nj div.mainlive_container div.container div div.liveboard_layout div.live_right.main_box_toggle')

    # The same button list backs the name/num/before fields; fetch it once.
    button_selector = 'div.regional_patient_status_A div.rpsa_map div.rpsam_graph div#main_maplayout button'
    buttons = contents[0].select(button_selector) if contents else []

    udate, area, num, before = [], [], [], []

    # NOTE(review): 16 = number of regions shown on the map at time of
    # writing — confirm against the live page if the layout changes.
    for i in range(16):
        # IndexError is what actually fires when a selector finds nothing
        # ([0]/[i] on an empty result); the original caught only ValueError,
        # which left these fallbacks dead. Catch both.
        try:
            tmp = contents[0].select('h2 a span.livedate')[0].text.strip()
            udate.append(make_date(tmp))
        except (ValueError, IndexError):
            udate.append('udate')

        try:
            area.append(buttons[i].select('span.name')[0].text.strip())
        except (ValueError, IndexError):
            area.append('area')

        try:
            num.append(buttons[i].select('span.num')[0].text.strip())
        except (ValueError, IndexError):
            num.append('num')

        try:
            tmp = buttons[i].select('span.before')[0].text.strip()
            before.append(remove_keyword(tmp))
        except (ValueError, IndexError):
            before.append('before')

    dic_corona = {
        'udate': udate,
        'area': area,
        'num': num,
        'before': before,
    }

    df_corona = pd.DataFrame(dic_corona)
    write_csv(df_corona)


# Entry point: fetch the Korean Ministry of Health & Welfare COVID-19 page
# and hand the parsed HTML to the scraper.
url = 'http://ncov.mohw.go.kr/'

# NOTE(review): no timeout or status check on the request — a hung or failed
# fetch will stall or feed error HTML into the scraper.
resp = requests.get(url)

# 'lxml' parser must be installed in the runtime (it is on Colab by default).
soup = BeautifulSoup(resp.text, 'lxml')

scrappy(soup)

댓글

이 블로그의 인기 게시물

[LINUX] CentOS 부팅시 오류 : UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY

[MSSQL] 데이터베이스가 사용 중이어서 배타적으로 액서스할 수 없습니다

구글코랩) 안전Dream 실종아동 등 검색 오픈API 소스를 공유합니다. (구글드라이브연동, 이미지 수집 소스)