Hi Carsten, How are the fragments expressed? With attachment points marked with "[*:1]", "[*:2]" and "[*:3]" atoms?
One technique is to rewrite the SMILES to use ring closures. (See https://onlinelibrary.wiley.com/doi/10.1002/qsar.200310008 or http://www.dalkescientific.com/writings/diary/archive/2005/05/07/attachment_points.html ). For example, if your core SMILES are: [*:1]c1ncc([*:2])cn1 CC([*:2])O[*:1] and your R1 contains *F Cl* Br* and your R2 contains *CCO CO* then you could rewrite these to use "%91" to connect the [*:1] with the R1 "*" and use "%92" to connect the [*:2] with the R2 "*", using dot-disconnected terms. For example: [*:1]c1ncc([*:2])cn1 + *F + *CCO can be rewritten as c%911ncc%92cn1.F%91.C%92CO which is parsed and canonicalized to: OCCc1cnc(F)nc1 Rewriting the SMILES this way is a bit tricky. I've attached a program which does it for you. Running it on the above gives: % cat core.smi [*:1]c1ncc([*:2])cn1 CC([*:2])O[*:1] % cat r1.smi *F Cl* Br* % cat r2.smi *CCO CO* % python enumerate.py --R1 r1.smi --R2 r2.smi core.smi c1%91ncc%92cn1.F%91.C%92CO -> OCCc1cnc(F)nc1 c1%91ncc%92cn1.F%91.CO%92 -> COc1cnc(F)nc1 c1%91ncc%92cn1.Cl%91.C%92CO -> OCCc1cnc(Cl)nc1 c1%91ncc%92cn1.Cl%91.CO%92 -> COc1cnc(Cl)nc1 c1%91ncc%92cn1.Br%91.C%92CO -> OCCc1cnc(Br)nc1 c1%91ncc%92cn1.Br%91.CO%92 -> COc1cnc(Br)nc1 CC(O%91)%92.F%91.C%92CO -> CC(CCO)OF CC(O%91)%92.F%91.CO%92 -> COC(C)OF CC(O%91)%92.Cl%91.C%92CO -> CC(CCO)OCl CC(O%91)%92.Cl%91.CO%92 -> COC(C)OCl CC(O%91)%92.Br%91.C%92CO -> CC(CCO)OBr CC(O%91)%92.Br%91.CO%92 -> COC(C)OBr It also supports --R3 if your core has 3 R-groups, with the third core point labeled [*:3]. Best regards Andrew da...@dalkescientific.com
"""Enumerate a library with a core and up to 3 sets of R-groups The core and R-group files contain one SMILES per line. The core SMILES must use labeled "*" wildcards, like: [*:1]c1ncc([*:2])cn1 where [*:1] is the attachment point for R1, [*:2] is the attachment point for R3, and [*:3] is the attachment point for R3. The R-group SMILES must have a single unlabled "*" wildcard, like: CO* -or- C(*)CO The program is used like this: python enumerate.py --R1 r1.smi --R2 r2.smi --R3 r3.smi core.smi """ # Written by Andrew Dalke <da...@dalkescientific.com> # 6 July 2022 import argparse from rdkit import Chem import itertools # Generate a SMILES string such that the wildcard is NOT the first atom. # This makes it easier to do an in-place string substitution of the # wildcard term with a ring closure. # # The approach finds a non-wildcard atom and uses that as the root # to generate the SMILES. # # Also verify the number of wildcard atoms is correct, and that # they have one (and only one) single bond, no isotope, and no charge. 
# # Also verify that if it's supposed to be labeled (as for the core # structures) then it uses [*:1], [*:2], or [*:3], and if it's # not labeled (as for the R-groups) then it does not use labels def get_reordered_smiles(mol, num_wildcards_required, is_labeled): num_wildcards = 0 non_wildcard_atom = None for atom in mol.GetAtoms(): # Do some validation if atom.GetAtomicNum() == 0: bonds = atom.GetBonds() if len(bonds) != 1: raise ValueError("'*' atom must have only one bond") if bonds[0].GetBondType() != Chem.BondType.SINGLE: raise ValueError("'*' atom must have only one single bond") if atom.GetFormalCharge() != 0: raise ValueError("'*' atom must be uncharged") if atom.GetIsotope() != 0: raise ValueError("'*' atom must have no isotope") num_wildcards += 1 elif non_wildcard_atom is None: # found a non-wildcard non_wildcard_atom = atom if num_wildcards != num_wildcards_required: raise ValueError( f"expecting {num_wildcards_required} '*' atoms, found {num_wildcards}") if non_wildcard_atom is None: raise ValueError( "R-group must have at least one non-'*' atom") # The generated SMILES does not begin with a wildcard. 
smiles = Chem.MolToSmiles(mol, rootedAtAtom=non_wildcard_atom.GetIdx()) # Double check some assertion if is_labeled: # Ensure enough labeled wildcards are present for substr in ("[*:1]", "[*:2]", "[*:3]")[:num_wildcards_required]: if substr not in smiles: raise ValueError(f"Expecting {substr}") else: # Ensure it isn't using labeled wildcards if "[*:" in smiles: raise ValueError(f"Must not be labeled wildcards") # Ensure RDKit generates wildcards without "[]"s assert "[*]" not in smiles, smiles # There must be a wildcard assert "*" in smiles, smiles return smiles # Convert "(*)" or "*" to the attachment "%91", "%92", or "%93" def unlabeled_wildcard_to_attachment(smiles_list, r): new_smiles_list = [] attachment = "%" + str(90 + r) for smiles in smiles_list: if "(*)" in smiles: smiles = smiles.replace("(*)", attachment) else: smiles = smiles.replace("*", attachment) new_smiles_list.append(smiles) return new_smiles_list # Convert "([*:1])", "[*:1]", "([*:2])", "[*:2]", and "([*:3])", # "[*:3]" to "%91", "%92", or "%93", respectively. def labeled_wildcard_to_attachment(smiles_list): new_smiles_list = [] for smiles in smiles_list: # Not particularly elegant, but it works. 
new_smiles = smiles.replace( "([*:1])", "%91").replace( "[*:1]", "%91").replace( "([*:2])", "%92").replace( "[*:2]", "%92").replace( "([*:3])", "%93").replace( "[*:3]", "%93") new_smiles_list.append(new_smiles) return new_smiles_list def parse_and_reorder_smiles_file(option, filename, num_wildcards_required, is_labeled): def die(msg): raise SystemExit(f"{msg}: {line!r}, line {lineno} of {option} {filename!r}") reordered_fragments = [] with open(filename) as infile: # Read lines from the file for lineno, line in enumerate(infile, 1): # Must be valid SMILES strings mol = Chem.MolFromSmiles(line) if mol is None: die("Cannot parse SMILES") # re-order the SMILES so the '*' atom is not first try: new_smiles = get_reordered_smiles( mol, num_wildcards_required, is_labeled) except ValueError as err: die(str(err)) reordered_fragments.append(new_smiles) return reordered_fragments #### parser = argparse.ArgumentParser( description = "enumerate a core and set of fragment files") parser.add_argument("--R1", metavar="FILENAME", required=True) parser.add_argument("--R2", metavar="FILENAME") parser.add_argument("--R3", metavar="FILENAME") parser.add_argument("core") def main(): args = parser.parse_args() # Process --R1, --R2, and --R3, which must have one unlabeled # wildcard atom libraries = [] for r, filename in enumerate([args.R1, args.R2, args.R3], 1): if filename is None: # Stop if --R2 or --R3 isn't specified. 
break reordered_R_groups = parse_and_reorder_smiles_file("--R{r}", filename, 1, 0) new_library = unlabeled_wildcard_to_attachment(reordered_R_groups, r) libraries.append(new_library) # Process the core file, which must have <num_r_groups> labeled wildcards num_r_groups = len(libraries) reordered_cores = parse_and_reorder_smiles_file("core", args.core, num_r_groups, 1) cores = labeled_wildcard_to_attachment(reordered_cores) # Enumerate the power series for row in itertools.product(cores, *libraries): row_smiles = ".".join(row) # parse and canonicalize new_smiles = Chem.CanonSmiles(row_smiles) print(row_smiles, "->", new_smiles) if __name__ == "__main__": main()
> On Jul 6, 2022, at 21:00, Carsten Bauer <carsten.ba...@bluewin.ch> wrote: > > Hello > > I have a structure with three substituents R1, R2 and R3 > R1 is an enumeration of 30+ SMILES > R2 and R3 each is an enumeration of <5 SMILES > Chemical space = 30 x 5 x 5 = 750+ in-silico compounds > > Can anyone share (i.e publish in a citable form) an RDKit code for this > permutation? > Is there a textbook example illustrating this daily question from the lab in > an example, please? > > I can’t follow > https://www.rdkit.org/docs/cppapi/EnumerationStrategyBase_8h_source.html > > Sorry. > > Many thanks for getting back. > Kindest regards > C. >
_______________________________________________ Rdkit-discuss mailing list Rdkit-discuss@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/rdkit-discuss