On 11/10/2010 20:21, Noel O'Boyle wrote:
I went through a large dataset of PubChem 3D structures looking for implicit H failures (removing, then adding Hs). 1/3 of the failures are due to the following in atomtyp.txt:INTHYB [$([#6]([#8D1])[#8D1])] 2 #sp2 carbon This makes any C attached to two Os turn into sp2...even in geminal diols , for example, ClC(Cl)(Cl)C(O)O. According to wikipedia, these don't tend to last long but they are in PubChem (whether errors or not, I can't say). If it's commented out, then it works fine for these cases. Is there any reason for this rule (it seems to date from the early days)? Perhaps it's to correct ligand structures from the PDB where all examples of this indicate COO-? If so, maybe the PDB cases are better handled in the code using the molecular geometry...?
I have always regarded the implicit valency model as unsatisfactory, and maybe it has become a bit messed up over the years. Is it currently used for anything other than recognizing when a hydrogen could or should be added to an atom?
A simpler and more obvious model for this purpose has essentially a single IMPVAL for each charge state of the molecule. (Only if you are interested in radicals or hydrogen on the higher valency states of second row elements do you need another rule for each higher valence.) There is no need for any skilful fine tuning. It is more maintainable and will be faster because not so many SMARTS patterns need to be matched. Up to now, It has worked for everything I've tried (although this not very extensive), except with test_formula, where the fault is in a couple of erroneous results in formularesults.txt, which at least shows the old model was error-prone and needs some more tweaking of phosphate structures.
There may be other side effects I'm not aware of. Just before a release is not a good time to commit something like this (5 years ago would have been better), so I've just attached a patch (changes to 11 code lines), if you want to try it.
Chris
Index: data/atomtyp.txt =================================================================== --- data/atomtyp.txt (revision 4157) +++ data/atomtyp.txt (working copy) @@ -77,105 +77,63 @@ # #IMPLICIT VALENCE RULES # -#IMPVAL [#3,#11,#19,#37,#55,#87] 1 # Alkali metals -#IMPVAL [#4,#12,#20,#38,#56,#88] 2 # Alkaline earth - IMPVAL [#5] 3 # Generic boron -IMPVAL [$([#6+]=*)] 2 -IMPVAL [$([#6+]#*)] 1 # just for InChI C2 -IMPVAL [#6^3] 4 -IMPVAL [#6^2] 3 -IMPVAL [#6^1] 2 -IMPVAL [#6^3-] 3 -IMPVAL [#6^2-] 2 -IMPVAL [#6^1-] 1 # isocyanides, CO (valence=1, bond order=3) -IMPVAL [c+] 2 +IMPVAL [#6] 4 # Generic carbon +IMPVAL [#6-] 3 +IMPVAL [#6+] 3 -IMPVAL [N^3] 3 -IMPVAL [N^3+] 4 -IMPVAL [N^2] 3 -#IMPVAL [N^2-,N^3-] 2 -IMPVAL [ND1^1] 1 -IMPVAL [ND2^1] 2 -IMPVAL [$(N([OD1])*)] 3 -IMPVAL [$(NC=O)] 3 -#IMPVAL [$(n(c=O)c=O)] 3 # aromatic diimide -IMPVAL [$(N(~[OD1])~[OD1])] 3 -IMPVAL [$([Nr0]C=[Nr0])] 3 -IMPVAL [$(N=*)] 2 -#IMPVAL [$([Nr0]=C[Nr0])] 3 -IMPVAL [$([Nr0]=C[Nr0])] 2 -IMPVAL [$([N+r0]=C[Nr0])] 3 # patch from Steve Constable -IMPVAL [$([#7D1-]=*)] 1 -IMPVAL [N^1-] 1 # N2O [N-]=[N+]=O (valence=1, bondorder=2) -IMPVAL [N-] 2 -IMPVAL [n] 2 -IMPVAL [n-] 2 -IMPVAL [n+] 3 -IMPVAL [$([#7-]=*)] 1 -IMPVAL [$([#7+]#*)] 2 +IMPVAL [#7] 3 # Generic nitrogen +IMPVAL [#7+] 4 +IMPVAL [#7-] 2 -IMPVAL [$([#8+]#*)] 1 +IMPVAL [#8] 2 # Generic oxygen +IMPVAL [#8-] 1 +IMPVAL [#8-2] 0 +IMPVAL [#8+] 3 -IMPVAL [#8] 2 # Generic O -IMPVAL [#8-] 1 -IMPVAL [#8-2] 0 -IMPVAL [#8+] 3 -IMPVAL [$([#8]=*)] 1 -IMPVAL [#8^1+] 1 # carbon monoxide +IMPVAL [#14] 4 # Generic silicon +IMPVAL [#14+] 3 +IMPVAL [#14-] 3 -IMPVAL [#9D0] 1 -IMPVAL [#9D0-] 0 #[F-] +IMPVAL [#16] 2 # Generic sulfur +IMPVAL [#16-] 1 +IMPVAL [#16-2] 0 +IMPVAL [#16+] 3 +IMPVAL [#16v3] 4 # Higher valence S +IMPVAL [#16v5] 6 -IMPVAL [#13] 3 # generic Al +IMPVAL [#17] 1 +IMPVAL [#17-] 0 +IMPVAL [#17v2] 3 # Higher valence Cl +IMPVAL [#17v4] 5 +IMPVAL [#17v6] 7 -IMPVAL [#14] 4 # regular sp3 Si -IMPVAL [#14^2] 2 # silylenes -IMPVAL [$([#14]=*)] 3 # e.g., Si=O -IMPVAL [$([#14](=*)=*)] 0 # e.g., Si(=O)=O -IMPVAL [$([#14+]#*)] 1 # just for InChI Si2 -IMPVAL [$([#14-]#*)] 1 # just for InChI Si2 - -IMPVAL [#15D4] 4 -IMPVAL [#15D3] 3 -IMPVAL [#15D2] 3 -IMPVAL [#15D1^3] 3 -IMPVAL [#15D1^2] 3 -IMPVAL [#15D0] 3 -IMPVAL [$([#15]=[#6])] 2 -IMPVAL [pD2] 2 -IMPVAL [$([#15]=[#8])] 4 # phosphinate (double bond = one valence) -IMPVAL [$([#15+](=[#8])([#8]))] 0 # R-P+O2 - -IMPVAL [#16] 2 #revised CM April 2008 -IMPVAL [$([#16D1]=*)] 1 -IMPVAL [$([#16D3](=O)(=O)-O)] 4 #e.g. O[S](=O)=O -IMPVAL [$([#16D2](=O)-O)] 3 # e.g. O[S]=O -IMPVAL [#16D1-] 1 - -IMPVAL [#17D0] 1 -IMPVAL [#17D0-] 0 #[Cl-] - IMPVAL [#31] 3 IMPVAL [#32] 4 IMPVAL [#33] 3 -IMPVAL [as] 3 IMPVAL [#34] 2 -IMPVAL [#35D0] 1 -IMPVAL [#35D0-] 0 #[Br-] +IMPVAL [#35] 1 +IMPVAL [#35-] 0 #[Br-] +IMPVAL [#35v2] 3 # Higher valence Br +IMPVAL [#35v4] 5 +IMPVAL [#35v6] 7 #IMPVAL [#49,#81] 3 # In, Tl, like Ga #IMPVAL [#50,#82] 4 # Sn, Pb, like Ge #IMPVAL [#51,#83] 3 # Sb, Bi, like As IMPVAL [#52] 2 -IMPVAL [#53D0] 1 -IMPVAL [#53D0-] 0 #[I-] +IMPVAL [#53] 1 +IMPVAL [#53-] 0 #[I-] +IMPVAL [#53v2] 3 # Higher valence I +IMPVAL [#53v4] 5 +IMPVAL [#53v6] 7 + # #EXTERNAL TYPE RULES # Index: src/atom.cpp =================================================================== --- src/atom.cpp (revision 4157) +++ src/atom.cpp (working copy) @@ -968,7 +968,7 @@ unsigned int bosum; OBBondIterator i; - bosum = GetImplicitValence(); + bosum = GetValence() + ImplicitHydrogenCount(); for (bond = ((OBAtom*)this)->BeginBond(i);bond;bond = ((OBAtom*)this)->NextBond(i)) { @@ -989,7 +989,7 @@ atomtyper.AssignImplicitValence(*((OBMol*)((OBAtom*)this)->GetParent())); // _impval is assigned by the atomtyper -- same as calling GetImplicitValence() - int impval = _impval - GetValence(); + int impval = _impval - BOSum(); // we need to modify this implicit valence if we're a radical center int mult = GetSpinMultiplicity(); Index: src/graphsym.cpp =================================================================== --- src/graphsym.cpp (revision 4157) +++ src/graphsym.cpp (working copy) @@ -174,7 +174,7 @@ else if (bond->IsAromatic()) count += 1.6f; } } - if (atom->GetAtomicNum() == 7 && atom->IsAromatic() && atom->GetImplicitValence() == 3) { + if (atom->GetAtomicNum() == 7 && atom->IsAromatic() && atom->GetValence()+atom->ImplicitHydrogenCount() == 3) { count += 1; // [nH] - add another bond } return(int(count + 0.5)); // round to nearest int Index: src/mol.cpp =================================================================== --- src/mol.cpp (revision 4157) +++ src/mol.cpp (working copy) @@ -2037,7 +2037,7 @@ atom->IsSulfur() || atom->IsPhosphorus())) continue; - hcount = atom->GetImplicitValence() - atom->GetValence(); + hcount = atom->GetImplicitValence() - atom->BOSum(); //Jan 05 Implicit valency now left alone; use spin multiplicity for implicit Hs int mult = atom->GetSpinMultiplicity(); @@ -2191,7 +2191,7 @@ int hcount,count=0; vector<pair<OBAtom*,int> > vhadd; - hcount = atom->GetImplicitValence() - atom->GetValence(); + hcount = atom->GetImplicitValence() - atom->BOSum(); //Jan 05 Implicit valency now left alone; use spin multiplicity for implicit Hs int mult = atom->GetSpinMultiplicity(); @@ -2347,7 +2347,7 @@ || (!atom->IsHydrogen() && atom->ExplicitHydrogenCount(true)!=0)//exclude D,T || atom->HasNoHForced()) { - diff=atom->GetImplicitValence() - (atom->GetHvyValence() + atom->ExplicitHydrogenCount()); + diff=atom->GetImplicitValence() - atom->BOSum(); if (diff) atom->SetSpinMultiplicity(diff+1);//radicals =2; all carbenes =3 } @@ -2555,7 +2555,7 @@ } if (secondpass && atom->IsNitrogen() && atom->GetFormalCharge() == 0 && - atom->GetImplicitValence() == 2) //try protonating the nitrogen + atom->GetValence() + atom->ImplicitHydrogenCount() == 2) //try protonating the nitrogen { atom->IncrementImplicitValence(); if (ExpandKekule(mol,va,i+1,maxv,secondpass)) Index: src/typer.cpp =================================================================== --- src/typer.cpp (revision 4157) +++ src/typer.cpp (working copy) @@ -243,7 +243,7 @@ OBAtom *atom; vector<OBAtom*>::iterator k; for (atom = mol.BeginAtom(k);atom;atom = mol.NextAtom(k)) - atom->SetImplicitValence(atom->GetValence()); + atom->SetImplicitValence(atom->BOSum()); vector<vector<int> >::iterator j; vector<pair<OBSmartsPattern*,int> >::iterator i; @@ -261,8 +261,8 @@ for (atom = mol.BeginAtom(k);atom;atom = mol.NextAtom(k)) { - if (atom->GetImplicitValence() < atom->GetValence()) - atom->SetImplicitValence(atom->GetValence()); + if (atom->GetImplicitValence() < atom->BOSum()) + atom->SetImplicitValence(atom->BOSum()); } // FF Come back to the initial flags @@ -513,7 +513,7 @@ //sanity check - exclude all 4 substituted atoms and sp centers for (atom = mol.BeginAtom(i);atom;atom = mol.NextAtom(i)) { - if (atom->GetImplicitValence() > 3) + if (atom->GetValence() > 3) { _vpa[atom->GetIdx()] = false; continue; Index: test/files/formularesults.txt =================================================================== --- test/files/formularesults.txt (revision 4157) +++ test/files/formularesults.txt (working copy) @@ -247,8 +247,8 @@ C15H19NO9S 389.378 389.078 C24H31NO15S 605.566 605.141 C8H5BrO3PSe 338.961 338.832 -C5H12N2O3PS 211.199 211.031 -C3H10N2O3PS 185.162 185.015 +C5H11N2O3PS 210.191 210.023 +C3H9N2O3PS 184.154 184.007 C10H21BN2O3 228.096 228.165 C7H17BN2O3 188.032 188.133 C4H13BN2O 115.97 116.112
------------------------------------------------------------------------------ Beautiful is writing same markup. Internet Explorer 9 supports standards for HTML5, CSS3, SVG 1.1, ECMAScript5, and DOM L2 & L3. Spend less time writing and rewriting code and more time creating great experiences on the web. Be a part of the beta today. http://p.sf.net/sfu/beautyoftheweb
_______________________________________________ OpenBabel-Devel mailing list OpenBabel-Devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/openbabel-devel