| S9a8m added a comment. |
The current program runs as follows:
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 26 14:49:17 2018

@author: Sam
"""
#Scrape chemical information from PubChem, using tag <h1 class="h4">
import requests
import pandas as pd
import re
# Element symbols ordered by increasing standard atomic mass (not by atomic
# number). ``element[i]`` and ``el_mass[i]`` describe the same element, so the
# two lists must always have the same length and order.
element = [
    "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
    "Na", "Mg", "Al", "Si", "P", "S", "Cl", "K", "Ar", "Ca",
    "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Ni", "Co", "Cu", "Zn",
    "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr",
    "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn",
    "Sb", "I", "Te", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd",
    "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb",
    "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
    "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Pa",
    "Th", "Np", "U", "Am", "Pu", "Cm", "Bk", "Cf", "Es", "Fm",
    "Md", "No", "Rf", "Lr", "Db", "Bh", "Sg", "Mt", "Hs",
]

# Atomic masses in g/mol, index-aligned with ``element``.
# The table previously stopped at Lu (71 entries for 109 symbols), so looking
# up any element from Hf onward raised IndexError. The entries from 178.49
# onward were added to close that gap.
# NOTE(review): the added values use the same rounding as the existing ones;
# confirm them against the reference table the first 71 were taken from.
el_mass = [
    1.008, 4.003, 6.941, 9.012, 10.811, 12.011, 14.007, 15.999, 18.998, 20.18,
    22.99, 24.305, 26.982, 28.086, 30.974, 32.065, 35.453, 39.098, 39.948, 40.078,
    44.956, 47.867, 50.942, 51.996, 54.938, 55.845, 58.693, 58.933, 63.546, 65.39,
    69.723, 72.64, 74.922, 78.96, 79.904, 83.8, 85.468, 87.62, 88.906, 91.224,
    92.906, 95.94, 98, 101.07, 102.906, 106.42, 107.868, 112.411, 114.818, 118.71,
    121.76, 126.905, 127.6, 131.293, 132.906, 137.327, 138.906, 140.116, 140.908, 144.24,
    145, 150.36, 151.964, 157.25, 158.925, 162.5, 164.93, 167.259, 168.934, 173.04,
    174.967, 178.49, 180.948, 183.84, 186.207, 190.23, 192.217, 195.078, 196.967, 200.59,
    204.383, 207.2, 208.98, 209, 210, 222, 223, 226, 227, 231.036,
    232.038, 237, 238.029, 243, 244, 247, 247, 251, 252, 257,
    258, 259, 261, 262, 262, 264, 266, 268, 277,
]

# Not referenced anywhere in the visible script; kept for compatibility.
undef = []
def calcmolmass(chemformula):
    """Return the molar mass of ``chemformula`` rounded to three decimals.

    The formula is read as a flat sequence of ``<Symbol><count>`` tokens
    (e.g. ``"C9H8O4"``); a missing count means 1. Anything from the first
    ``+`` or ``-`` onward is treated as a charge suffix and ignored.
    Masses come from the module-level ``element`` / ``el_mass`` tables.

    Raises:
        ValueError: a token's symbol is not in ``element`` (this includes
            tokens containing brackets or other punctuation), or the text
            after its first digit is not a valid number.
        IndexError: the symbol is in ``element`` but ``el_mass`` has no
            entry at that position.
    """
    # Strip the charge: keep only what precedes the first "+" and then "-".
    wocharge = chemformula.split("+")[0].split("-")[0]

    mass = 0
    # Each token starts at an uppercase letter and runs to the next one.
    for part in re.findall(r"[A-Z][^A-Z]*", wocharge):
        digit = re.search(r"\d", part)
        if digit is None:
            symbol, count = part, 1
        else:
            # Everything before the first digit is the symbol; the rest is
            # the multiplier.
            symbol = part[:digit.start()]
            count = float(part[digit.start():])
        mass += el_mass[element.index(symbol)] * count

    # Round via formatting so the result matches the 3-decimal table values.
    return float("{:.3f}".format(mass))
# Scrape name and formula from the <title> of each page, compute the molar
# mass, and write the collected rows to chemical_database.csv.
chemicals = []
for n in range(1, 1000):
    # NOTE(review): the URL literal was empty in the posted source (apparently
    # stripped in transit) and ``n`` was unused, so every request failed.
    # Presumably this is the PubChem compound page for CID ``n`` -- confirm.
    url = "https://pubchem.ncbi.nlm.nih.gov/compound/{}".format(n)

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(url, timeout=30)
    except requests.RequestException as err:
        print("{:} - request failed ({:})".format(url, err))
        continue
    body = r.text

    # The title is expected to look like "<name> | <formula> - ...".
    try:
        title1 = body.split('<title>')[1]
        t = title1.split('</title>')[0]
        title = t.split(' | ')
        chem_n = title[0]
        chem_f = title[1].split(' - ')[0]
    except IndexError:
        # No <title>, or no " | " separator: there is nothing to report, and
        # re-parsing here (as the old handler did) would just raise again.
        print("{:} - (no name/formula in page title)".format(url))
        continue

    try:
        chem_mr = calcmolmass(chem_f)
    except (ValueError, IndexError):
        # Formula the calculator cannot handle; report it and move on.
        print("{:} - ({:})".format(url, chem_f))
        continue

    chemicals.append([chem_n, chem_f, chem_mr])

col_name = ["Name", "Formulae", "Mr"]
df = pd.DataFrame(chemicals, columns=col_name)
df.to_csv('chemical_database.csv', sep=',', index=False)
print(df)
Bit chunky but WORKING - now need to upload the CSV to Wikidata.
TASK DETAIL
EMAIL PREFERENCES
To: S9a8m
Cc: BorDeh, Vemonet, Freddytuxworth, Husky, Laffano, S9a8m, A_ka_es, Teffubud, Dinadineke, Arybolab, Dja, Elvalente, Nandana, tabish.shaikh91, Lahi, Gq86, GoranSMilovanovic, Soteriaspace, Jayprakash12345, JakeTheDeveloper, QZanden, merbst, LawExplorer, DDJJ, Harmonia_Amanda, Spinster, Jane023, Wikidata-bugs, aude, TheDJ, Mbch331, valhallasw
Cc: BorDeh, Vemonet, Freddytuxworth, Husky, Laffano, S9a8m, A_ka_es, Teffubud, Dinadineke, Arybolab, Dja, Elvalente, Nandana, tabish.shaikh91, Lahi, Gq86, GoranSMilovanovic, Soteriaspace, Jayprakash12345, JakeTheDeveloper, QZanden, merbst, LawExplorer, DDJJ, Harmonia_Amanda, Spinster, Jane023, Wikidata-bugs, aude, TheDJ, Mbch331, valhallasw
_______________________________________________ Wikidata-bugs mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/wikidata-bugs
