-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data Scrap.py
95 lines (82 loc) · 2.91 KB
/
Data Scrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import pickle
import re
from googlesearch import search
import warnings
warnings.filterwarnings("ignore")
import requests
from bs4 import BeautifulSoup
import time
# Fetch the disease list from 'www.nhp.gov.in'.
# The National Health Portal publishes an A-Z index (one page per lowercase
# letter); collect every <li> entry inside each page's 'all-disease' container.
import string

small_alpha = list(string.ascii_lowercase)
diseases = []
for letter in small_alpha:
    url = 'https://www.nhp.gov.in/disease-a-z/' + letter
    time.sleep(1)  # throttle: one request per second so we don't hammer the site
    # NOTE(review): verify=False disables TLS certificate validation — kept to
    # preserve existing behavior (the site's cert may be broken), but confirm
    # this is intentional before shipping.
    page = requests.get(url, verify=False)
    soup = BeautifulSoup(page.content, 'html5lib')
    all_diseases = soup.find('div', class_='all-disease')
    if all_diseases is None:
        # Guard against layout changes / error pages: skip letters whose page
        # lacks the expected container instead of crashing with AttributeError.
        continue
    for element in all_diseases.find_all('li'):
        diseases.append(element.get_text().strip())
# Merge the freshly scraped names with a previously saved disease-name list,
# deduplicate via set union, and sort so downstream processing is deterministic.
with open('list_diseaseNames.pkl', 'rb') as handle:
    diseases2 = pickle.load(handle)

# `c` is the canonical, sorted, duplicate-free disease list consumed by the
# symptom-scraping loop below.
c = sorted(set(diseases) | set(diseases2))
# Search each disease on Google, open its Wikipedia article, and pull the
# 'Symptoms' row out of the infobox. Results accumulate in dis_symp
# (disease name -> cleaned, comma-separated symptom string).

def _clean_symptom_html(raw_html):
    """Strip markup from the infobox <td> HTML, returning plain text.

    Removes periods, normalizes ';' to ',', drops bold lead-ins, anchor tags,
    all remaining tags, and bracketed citation markers, then collapses
    whitespace and stray comma tokens.
    """
    text = raw_html.replace('.', '')
    text = text.replace(';', ',')
    text = re.sub(r'<b.*?/b>:', ',', text)   # remove bold text (e.g. "Early:")
    text = re.sub(r'<a.*?>', '', text)       # remove opening hyperlink tags
    text = re.sub(r'</a>', '', text)         # remove closing hyperlink tags
    text = re.sub(r'<[^<]+?>', ', ', text)   # remove all remaining tags
    # BUG FIX: the original pattern r'\[.*\]' was greedy and deleted everything
    # between the FIRST '[' and the LAST ']' — including real symptoms listed
    # between two citations. Non-greedy removes each citation individually.
    text = re.sub(r'\[.*?\]', '', text)
    return ' '.join(tok for tok in text.split() if tok != ',')

dis_symp = {}
for dis in c:
    # Search "<disease> wikipedia" on Google and use the first Wikipedia hit.
    query = dis + ' wikipedia'
    filled = 0  # 1 once a Symptoms row has been captured for this disease
    for sr in search(query, tld="co.in", stop=10, pause=0.5):
        if not re.search(r'wikipedia', sr):
            continue  # not a Wikipedia link; try the next search result
        # NOTE(review): verify=False skips TLS validation — kept as-is.
        wiki = requests.get(sr, verify=False)
        soup = BeautifulSoup(wiki.content, 'html5lib')
        # Fetch the article's 'infobox' table, if the page has one.
        info_table = soup.find("table", {"class": "infobox"})
        if info_table is not None:
            for row in info_table.find_all("tr"):
                header = row.find("th", {"scope": "row"})
                if header is not None and header.get_text() == "Symptoms":
                    dis_symp[dis] = _clean_symptom_html(str(row.find("td")))
                    filled = 1
                    break  # stop scanning rows: Symptoms found
        if filled == 1:
            break  # stop scanning search results for this disease
# Remove diseases whose symptom string duplicates one already kept, so each
# unique symptom list appears exactly once (the first disease seen wins;
# dropped duplicates are printed for inspection).
seen_symptoms = set()  # set membership is O(1) vs the original O(n) list scan
tmp_dict = {}
for key, value in dis_symp.items():
    if value in seen_symptoms:
        print(key)  # report the disease dropped as a duplicate
    else:
        tmp_dict[key] = value
        seen_symptoms.add(value)

# Save the deduplicated dictionary to a pickle file.
dis_symp = tmp_dict
print(len(dis_symp))
with open('final_dis_symp.pickle', 'wb') as handle:
    pickle.dump(dis_symp, handle, protocol=pickle.HIGHEST_PROTOCOL)