Hi Philipp, This is an embarrassingly parallel problem (that's the actual technical term, so no need to feel embarrassed. :-), meaning there's no need for communication between threads or processes, which makes it really easy: just split the search space, run a separate job for each fraction, and combine the outputs at the end.
For example, if your building block file "A" has 100 structures and you want to use 4 CPUs, just split it into four files of 25 structures, run a job using each file, and concatenate the outputs when the four jobs are done. IMHO there's no need for any fancy multiprocessing modules; running a completely independent process gives you more flexibility (you can run each subjob on a different machine, submit it to a queue, email it to your cousin, or whatever...) Ivan On Fri, Jun 11, 2021 at 6:47 AM Philipp Otten <philipp.ott...@gmail.com> wrote: > Dear all lovely people, > first of all, I'm rather new to programming/Python/RDKit and probably > my issue is quite easy to solve if you're more experienced. > So I wrote a Python script simulating a reaction-workflow for a multi > step synthesis with a lot of different building blocks. The program works > as intended, but it takes a lot of time (a few days) because it uses only one > CPU thread. Therefore I thought about using more via > multiprocessing/multithreading, but I couldn't get it to run. I tried a > lot, but didn't even figure out exactly where to start. Maybe you guys can > give me a hint in the right direction? 
> > First of all the "Synthesis-Class": > > from rdkit import Chem > from rdkit.Chem import AllChem > > > class Synthesis: > """Combination of the single synthesis steps.""" > > def __init__(self, > sdf_a, sdf_b, sdf_f, sdf_k, > s1_smarts, s2_smarts, s3_smarts, s4_smarts, s5_smarts, > ): > """Initialize building block molecules""" > self.sdf_a = sdf_a > self.sdf_b = sdf_b > self.sdf_f = sdf_f > self.sdf_k = sdf_k > self.s1_smarts = s1_smarts > self.s2_smarts = s2_smarts > self.s3_smarts = s3_smarts > self.s4_smarts = s4_smarts > self.s5_smarts = s5_smarts > > def react1(self, t1): > """First step of the reaction""" > rxn1 = AllChem.ReactionFromSmarts(self.s1_smarts) > with open(t1, "w") as t1: > with open(self.sdf_a, "r") as f_a: > while True: > line_a = f_a.readline() > > if not line_a: > break > > line_a = Chem.MolFromSmiles(line_a) > > with open(self.sdf_b) as f_b: > while True: > line_b = f_b.readline() > > if not line_b: > break > > line_b = Chem.MolFromSmiles(line_b) > > p1 = rxn1.RunReactants((line_a, line_b)) > p1 = [x for t in p1 for x in t] > for x in p1: > x = Chem.MolToSmiles(x) > t1.write(x) > t1.write("\n") > f_a.close() > f_b.close() > t1.close() > > def react2(self, t1, t2): > """Second step of the synthesis""" > rxn2 = AllChem.ReactionFromSmarts(self.s2_smarts) > with open(t2, "w") as t2: > with open(t1, "r") as f_c: > while True: > line_c = f_c.readline() > if not line_c: > break > > line_c = Chem.MolFromSmiles(line_c) > > if line_c == None: > pass > else: > p2 = rxn2.RunReactants((line_c,)) > p2 = [x for t in p2 for x in t] > for x in p2: > x = Chem.MolToSmiles(x) > t2.write(x) > t2.write("\n") > f_c.close() > t2.close() > > def react3 (self, t2, t3): > """Third step of the synthesis""" > rxn3 = AllChem.ReactionFromSmarts(self.s3_smarts) > with open(t3, "w") as t3: > with open(t2, "r") as f_d: > while True: > line_d = f_d.readline() > > if not line_d: > break > > line_d = Chem.MolFromSmiles(line_d) > > p3 = rxn3.RunReactants((line_d,)) > p3 = [x 
for t in p3 for x in t] > for x in p3: > x = Chem.MolToSmiles(x) > t3.write(x) > t3.write("\n") > f_d.close() > t3.close() > > > def react4 (self, t3, t4): > """Fourth step of the synthesis""" > rxn4 = AllChem.ReactionFromSmarts(self.s4_smarts) > > with open(t4, "w") as t4: > with open(self.sdf_f, "r") as f_f: > while True: > line_f = f_f.readline() > > if not line_f: > break > > line_f = Chem.MolFromSmiles(line_f) > > with open(t3) as f_g: > while True: > line_g = f_g.readline() > > if not line_g: > break > > line_g = Chem.MolFromSmiles(line_g) > > p4 = rxn4.RunReactants((line_g, line_f)) > p4 = [x for t in p4 for x in t] > for x in p4: > x = Chem.MolToSmiles(x) > t4.write(x) > t4.write("\n") > f_f.close() > f_g.close() > t4.close() > > def react5(self, t4, t5): > """Last step of the synthesis""" > rxn5 = AllChem.ReactionFromSmarts(self.s5_smarts) > with open(t5, "w") as t5: > with open(self.sdf_k, "r") as f_k: > while True: > line_k = f_k.readline() > > if not line_k: > break > line_k = Chem.MolFromSmiles(line_k) > > with open(t4) as f_l: > while True: > line_l = f_l.readline() > > if not line_l: > break > > line_l = Chem.MolFromSmiles(line_l) > > p5 = rxn5.RunReactants((line_l, line_k)) > p5 = [x for t in p5 for x in t] > for x in p5: > x = Chem.MolToSmiles(x) > t5.write(x) > t5.write("\n") > > and then the actual execution program: > > from rdkit import Chem > from rdkit.Chem import AllChem > from synthesis_class2_1 import Synthesis > from duplicate_class import DuplicateRemoval #simple class to remove > duplicates from .txt files. 
> > > sdf_a = ".txt file full of SMILE-Strings" > sdf_b = " .txt file full of SMILE-Strings " > sdf_f = " .txt file full of SMILE-Strings " > sdf_k = " .txt file full of SMILE-Strings " > > s1_smarts = "SMARTS for the reaction" > s2_smarts = " SMARTS for the reaction " > s3_smarts = " SMARTS for the reaction " > s4_smarts = " SMARTS for the reaction " > s5_smarts = " SMARTS for the reaction " > > t1 = ".txt file with all the products of reaction 1" > t1d = " .txt file without duplicates for the products of reaction 1 " > t2 = " .txt file with all the products of reaction 2 " > t2d = " .txt file without duplicates for the products of reaction 2 " > t3 = " .txt file with all the products of reaction 3" > t3d = " .txt file without duplicates for the products of reaction 3 " > t4 = " .txt file with all the products of reaction 4 " > t4d = " .txt file without duplicates for the products of reaction 4 " > t5 = " .txt file with all the products of reaction 5 " > t5d = " .txt file without duplicates for the products of reaction 5 " > > rxnrun = Synthesis(sdf_a, sdf_b, sdf_f, sdf_k, s1_smarts, s2_smarts, > s3_smarts, s4_smarts, s5_smarts) > rxnrun.react1(t1) > DuplicateRemoval.check_duplicates(t1, t1d) > rxnrun.react2(t1d, t2) > DuplicateRemoval.check_duplicates(t2, t2d) > rxnrun.react3(t2d, t3) > DuplicateRemoval.check_duplicates(t3, t3d) > rxnrun.react4(t3, t4) > DuplicateRemoval.check_duplicates(t4, t4d) > rxnrun.react5(t4d, t5) > DuplicateRemoval.check_duplicates(t5, t5d) > > > That's it. I tried to implement different "types" of > multiprocessing/threading for each "RunReactants()" but haven't figured out > how. Especially because everyone says that implementing multiprocessing in > Python/RDKit is rather easy. > Any tips appreciated. > Many, many thanks for your support. 
> Kind regards > Philipp > _______________________________________________ > Rdkit-discuss mailing list > Rdkit-discuss@lists.sourceforge.net > https://lists.sourceforge.net/lists/listinfo/rdkit-discuss >
_______________________________________________ Rdkit-discuss mailing list Rdkit-discuss@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/rdkit-discuss