On Dec 13, 2012, at 3:32 PM, [email protected] wrote:
>> I think I figured out a way around that via some post-processing.
>
> Great!
> Now let's come to another question:
> How does one code the "complete-ring-only" variation?
> Can your code be adapated, or shall I do some post-processing?
Ideally the MCS code should be modified so that a user-defined
class can come in and replace the pruning and matching code.
Post-processing is definitely trickier for this one. I can get
what you want through a horrible bit of monkey-patching to
internal and unpublished parts of fmcs.
% python paul.py paul.smi --threshold 0.15 --complete-rings-only
frequency #atoms #bonds SMILES SMARTS
3 5 5 c1cncn1 [#6]:1:[#6]:[#7]:[#6]:[#7]:1
3 8 8 NCc1ccccc1 [#7]-!@[#6]-!@[#6]:1:[#6]:[#6]:[#6]:[#6]:[#6]:1
2 10 10 CCNCc1ccccc1
[#6]-!@[#6]-!@[#7]-!@[#6]-!@[#6]:1:[#6]:[#6]:[#6]:[#6]:[#6]:1
import sys
import re
from rdkit import Chem
import hacked_fmcs
import argparse
parser = argparse.ArgumentParser("Find fragments")
parser.add_argument("filename",
help="SMILES filename")
parser.add_argument("--threshold", "-t", type=float, default=0.5,
help="minimum match threshold (default: 0.5)")
parser.add_argument("--min-num-atoms", type=int, default=2,
help="minimum number of atom for the match (default: 2)")
parser.add_argument("--min-num-bonds", type=int, default=1,
help="minimum number of bonds for the match (default: 1)")
parser.add_argument("--complete-rings-only", action="store_true",
help="don't match part ring in a fragment")
args = parser.parse_args()
if args.min_num_atoms < 2:
parser.error("--min-num-atoms must be at least 2")
if args.min_num_bonds < 1:
parser.error("--min-num-bonds must be at least 2")
if args.threshold <= 0.0:
parser.error("--threshold must be positive")
table = Chem.GetPeriodicTable()
_atomic_num_to_symbol = dict((str(i), table.GetElementSymbol(i)) for i in range(105))
atom_pat = re.compile("\[#(\d+)\]")
def atomic_num_to_symbol(m):
return _atomic_num_to_symbol[m.group(1)]
def smarts_to_unique_smiles_fragment(smarts):
fragment_smiles = atom_pat.sub(atomic_num_to_symbol, smarts)
fragment_smiles = fragment_smiles.replace("!@", "").replace("@", "")
mol = Chem.MolFromSmiles(fragment_smiles, sanitize=False)
# Correct the aromaticity information
for bond in mol.GetBonds():
if bond.GetIsAromatic():
bond.GetBeginAtom().SetIsAromatic(1)
bond.GetEndAtom().SetIsAromatic(1)
return Chem.MolToSmiles(mol)
## Hack in code to check the results of the complete_rings_only check
only_has_complete_rings = {}
old_check_complete_rings_only = hacked_fmcs.check_complete_rings_only
def new_check_complete_rings_only(smarts, subgraph, enumeration_mol):
result = old_check_complete_rings_only(smarts, subgraph, enumeration_mol)
only_has_complete_rings[smarts] = result
return result
hacked_fmcs.check_complete_rings_only = new_check_complete_rings_only
hacked_fmcs._maximize_options[("all-matches", False)] = (
hacked_fmcs.no_pruning, hacked_fmcs.SingleBestAtoms)
hacked_fmcs._maximize_options[("all-matches", True)] = (
hacked_fmcs.no_pruning, hacked_fmcs.SingleBestAtomsCompleteRingsOnly)
##### Read in the input file
mols = []
for lineno, line in enumerate(open(args.filename)):
fields = line.split()
if not fields:
sys.stderr.write("Missing SMILES on line %d" % (lineno+1,))
continue
smiles = fields[0]
mol = Chem.MolFromSmiles(smiles)
if mol is None:
sys.stderr.write("Could not process SMILES %s on line %d\n" % (smiles, lineno+1))
continue
mols.append(mol)
mcs_result = hacked_fmcs.fmcs(mols,
threshold=args.threshold,
min_num_atoms=args.min_num_atoms,
complete_rings_only=args.complete_rings_only,
# fmcs doesn't implement min_num_bonds; use post-processing
maximize="all-matches")
# Convert to unique fragment SMILES
matches = []
unique_fragment_smiles = set()
for smarts, is_match in mcs_result.matches.items():
if not is_match: # remember, this is hacked code, and a bit ugly!
continue
if args.complete_rings_only:
if not only_has_complete_rings.get(smarts, False):
continue
fragment_smiles = smarts_to_unique_smiles_fragment(smarts)
if fragment_smiles in unique_fragment_smiles:
continue
unique_fragment_smiles.add(fragment_smiles)
pattern = Chem.MolFromSmarts(smarts)
if pattern.GetNumBonds() < args.min_num_bonds:
continue
matches.append( (pattern, smarts, fragment_smiles) )
def order_by_number_of_atoms(term):
return term[0].GetNumAtoms()
matches.sort(key=order_by_number_of_atoms)
print "frequency #atoms #bonds SMILES SMARTS"
for pattern, smarts, fragment_smiles in matches:
count = sum(1 for mol in mols if mol.HasSubstructMatch(pattern))
print count, pattern.GetNumAtoms(), pattern.GetNumBonds(), fragment_smiles, smarts
Longer term you likely want this as a supported, stable feature.
I am a consultant and available for custom software development. :)
But that discussion shouldn't be on the RDKit mailing list.
Andrew
[email protected]
------------------------------------------------------------------------------
LogMeIn Rescue: Anywhere, Anytime Remote support for IT. Free Trial
Remotely access PCs and mobile devices and provide instant support
Improve your efficiency, and focus on delivering more value-add services
Discover what IT Professionals Know. Rescue delivers
http://p.sf.net/sfu/logmein_12329d2d
_______________________________________________
Rdkit-discuss mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/rdkit-discuss