This is an automated email from the git hooks/post-receive script. tille pushed a commit to branch master in repository qcumber.
commit 966a33aad7fbeb70a28f2f08489c8662e01ad4a0 Author: Andreas Tille <[email protected]> Date: Mon Mar 13 14:13:56 2017 +0100 New upstream version 1.0.9+dfsg --- QCumber.py | 600 ++++++++++++++++++++++++++++-------------------------- barplot.R | 56 +++-- batch_report.html | 10 +- classes.py | 96 +++------ config.txt | 6 +- helper.py | 11 +- parameter.txt | 13 +- report.tex | 14 +- 8 files changed, 410 insertions(+), 396 deletions(-) diff --git a/QCumber.py b/QCumber.py index c56f2a7..c7eccde 100755 --- a/QCumber.py +++ b/QCumber.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 __author__ = 'LieuV' -__version__ = "1.0.5" +__version__ = "1.0.9" from classes import * from helper import * @@ -11,7 +11,6 @@ import shutil import configparser from collections import OrderedDict import json -import csv #dependencies try: @@ -20,95 +19,102 @@ except ImportError: print('The "argparse" module is not available. Use Python >3.2.') sys.exit(1) +#parameter = configparser.ConfigParser(strict=False, interpolation=None) +#parameter.read(join(dirname(__file__), "parameter.txt")) -parameter = configparser.ConfigParser() -parameter.read(join(dirname(__file__), "parameter.txt")) +#r1_pattern = parameter["Pattern"]["R1"] +#r2_pattern = parameter["Pattern"]["R2"] +#sep_pattern = parameter["Pattern"]["sep"] +#lane_pattern = parameter["Pattern"]["lane"] +global parameter +global r1_pattern +global r2_pattern +global sep_pattern +global lane_pattern -file_extensions = parameter["Fileextension"]["fastq"] -r1_pattern = parameter["Pattern"]["R1"] -r2_pattern = parameter["Pattern"]["R2"] -lane_pattern = parameter["Pattern"]["lane"] - -def get_illumina_reads( output, tmp): - ## Add readsets to sample +def get_illumina_reads(tmp): readsets = [] + # split by lanes - if isinstance(arguments["r1"], list): - for lane in sorted(set([re.search(lane_pattern, x).group() for x in arguments["r1"]])): - # concat same files - r1_reads = [x for x in arguments["r1"] if lane in x] - readname = re.sub(r1_pattern + ".*", "", os.path.basename(r1_reads[0])) - if len(arguments["r1"]) != 1: - r1 = FastQFile(join_reads(r1_reads, tmp, readname + "_R1"), [toLatex(os.path.basename(x)) for x in r1_reads] ) - else: - r1 = FastQFile(r1_reads[0]) - if arguments["r2"]: - r2_reads = [x for x in arguments["r2"] if lane in x] + for lane in sorted(set([re.search(lane_pattern, x).group() for x in arguments["r1"]])): + # concat same lanes + r1_reads = [x for x in arguments["r1"] if lane in x] + readname = re.sub(r1_pattern + ".*", "", os.path.basename(r1_reads[0])) + if len(arguments["r1"]) != 1: + r1 = FastQFile(join_reads(r1_reads, tmp, readname + "_R1"), [toLatex(os.path.basename(x)) for x in r1_reads] ) + else: + r1 = FastQFile(r1_reads[0]) + if arguments["r2"]: + r2_reads = [x for x in arguments["r2"] if lane in x] - if len(r2_reads) != 1: - r2 = FastQFile(join_reads(r2_reads, tmp, readname + "_R2"),[toLatex(os.path.basename(x)) for x in r2_reads] ) - else: - r2 = FastQFile(r2_reads[0]) - readsets.append(ReadSet(r1,r2)) + if len(r2_reads) != 1: + r2 = FastQFile(join_reads(r2_reads, tmp, readname + "_R2"),[toLatex(os.path.basename(x)) for x in r2_reads] ) else: - readsets.append(ReadSet(r1)) - else: - r1 = FastQFile(arguments["r1"]) - if arguments["r2"]: - r2 = FastQFile(arguments["r2"]) - return [ReadSet(r1,r2)] + r2 = FastQFile(r2_reads[0]) + readsets.append(ReadSet(r1,r2)) else: - return [ReadSet(r1)] - - + readsets.append(ReadSet(r1)) return readsets -def run_analysis (readset, arguments, output, tmp): - print("Run FastQC...") - readset.run_FastQC(join(output, "FastQC")) - if arguments["trimBetter"]: - trimming_perc = arguments["trimBetter_threshold"] - else: - trimming_perc = "" - readset.run_Trimmomatic(arguments["adapter"], arguments["palindrome"],arguments["minlen"], arguments["trimOption"], trimming_perc, arguments["gz"]) - readset.trimRes = readset.trimRes.run_FastQC( tmp) - return readset - +def get_input(): -def check_input(arguments): if arguments["r1"]: if arguments["r2"]: return [arguments["r1"], arguments["r2"]] else: return [arguments["r1"]] - if arguments["technology"]=="Illumina": - if os.path.isdir(arguments["input"]): - files = os.listdir(arguments["input"]) - files = [os.path.join(arguments["input"], x) for x in files] - files = [x for x in files if os.path.isfile(x) & any([ext in x for ext in file_extensions])] - if len(files)==0: - sys.exit(arguments["input"] + " does not contain fastq files.") - return files + all_files = [] + bam_ext = [x.strip(" ") for x in parameter["Fileextension"]["bam"].split(",")] + fastq_ext = [x.strip(" ") for x in parameter["Fileextension"]["fastq"].split(",")] + if arguments["technology"]: + if arguments["technology"]=="IonTorrent": + fastq_ext = [] else: - return [arguments["input"]] - elif arguments["technology"]=="IonTorrent": + bam_ext = [] - if os.path.isfile(arguments["input"]): + for root, dirs, files in os.walk(arguments["input"]): + for file in files: + if any([file.endswith(ext) for ext in fastq_ext + bam_ext ]): + all_files.append(join(root,file)) - if arguments["input"].endswith(".bam"): - return arguments["input"] - else: - sys.exit("IonTorrent PGM input has to be a bam-file. Otherwise use -b. Use --help for more information.") + if not arguments["technology"]: + if len([x for x in all_files if any([ext in x for ext in bam_ext])]) == len(all_files): + arguments["technology"] = "IonTorrent" else: - sys.exit("Incorrect file input") - else: - sys.exit("Incorrect file input") - return None + arguments["technology"] = "Illumina" + #if arguments["technology"]: + # if arguments["technology"]=="Illumina": + # # fastq only + # all_files = [x for x in all_files if any([ext in x for ext in fastq_ext])] + # elif arguments["technology"]=="IonTorrent": + # # bam only + # all_files = [x for x in all_files if any([ext in x for ext in bam_ext])] + + if (len(all_files) == 0): + sys.exit(arguments["input"] + " does not contain fastq or bam files.") + + # find read pairs + all_files = sorted(list(all_files)) + paired_reads_pattern =sep_pattern.join([lane_pattern , "(" + r1_pattern + "|" + r2_pattern + ")","\d{3}"]) + if all([re.search(paired_reads_pattern, x) for x in all_files]): + paired_reads = [] + for setname, files in groupby(all_files, key=lambda x: "_".join(x.split("_")[:-3])): + temp = sorted(list(files)) + r1_reads = [x for x in temp if r1_pattern in x] + r2_reads = [x for x in temp if r2_pattern in x] + if len(r1_reads) != 0: + if len(r2_reads) != 0: + paired_reads.append([r1_reads, r2_reads]) + else: + paired_reads.append(r1_reads) + return paired_reads + else: # treat each file as sample + return [[x] for x in all_files ] + def getSetname(): try: - print("Renmae", arguments["rename"], arguments["r1"]) if arguments["rename"]: new_name = [arguments["rename"][x] for x in arguments["rename"].keys() if basename(arguments["r1"][0]).startswith(x)] @@ -120,51 +126,57 @@ def getSetname(): else: print("Renamed to " ,new_name) return new_name[0] - - if arguments["technology"] == "Illumina": - - if isinstance(arguments["r1"], list): - return "_".join(basename(arguments["r1"][0]).split("_")[:-3]) - else: - return getFilenameWithoutExtension(arguments["r1"], True) - else: - return getFilenameWithoutExtension(arguments["r1"], getBase=True) except: + print("Couldnt rename sample files.") + try: + paired_reads_pattern = sep_pattern.join([sep_pattern + lane_pattern, "(" + r1_pattern + "|" + r2_pattern + ")", "\d{3}.*"]) if isinstance(arguments["r1"], list): - return getFilenameWithoutExtension(arguments["r1"][0], getBase=True) + return re.sub(paired_reads_pattern, "" ,basename(arguments["r1"][0])) else: - return getFilenameWithoutExtension(arguments["r1"], getBase=True) + return re.sub(paired_reads_pattern,"",basename(arguments["r1"])) + except: + print("Problems getting samplenames") + return arguments["r1"] -def fillClasses(temp_bowtie_path = "", tmp=''): +def runAnalyses(temp_bowtie_path, tmp): try: output = getDir([arguments["output"], "QCResults"], True) - sample = Sample(getSetname(), output, arguments["reference"],arguments["threads"], arguments["technology"]) + if arguments["technology"].startswith("Illumina"): - readsets = get_illumina_reads( output, tmp) + readsets = get_illumina_reads(tmp) elif arguments["technology"] == "IonTorrent": print("IonTorrent", arguments["r1"]) fastqfile = bam_to_fastq(arguments["r1"],tmp) readsets = [ReadSet( fastqfile)] for rs in readsets: - readset = run_analysis(rs, arguments, output, tmp) - sample = sample.add_readSet(readset) + print("Run FastQC...") + rs.run_FastQC(join(output, "FastQC")) + if not arguments["notrimming"]: + if arguments["trimBetter"]: + trimming_perc = arguments["trimBetter_threshold"] + else: + trimming_perc = "" + rs.run_Trimmomatic(arguments["adapter"], arguments["palindrome"], arguments["minlen"], + arguments["trimOption"], trimming_perc, arguments["gz"]) + rs.trimRes = rs.trimRes.run_FastQC(tmp) + sample = sample.add_readSet(rs) if not arguments["nomapping"]: if arguments["save_mapping"]: - sample.mappingRes = sample.run_Bowtie2(temp_bowtie_path, join(arguments["output"], "QCResults", sample.name +".bam")) + sample.mappingRes = sample.run_Bowtie2(temp_bowtie_path, join(arguments["output"], "QCResults", sample.name +".bam"), not arguments["notrimming"]) else: - sample.mappingRes = sample.run_Bowtie2(temp_bowtie_path, "/dev/null") + sample.mappingRes = sample.run_Bowtie2(temp_bowtie_path, "/dev/null", not arguments["notrimming"]) if not arguments["nokraken"]: print("Run Kraken.") - sample = sample.run_Kraken(arguments["db"]) + sample = sample.run_Kraken(arguments["db"], not arguments["notrimming"]) sample.stop = datetime.datetime.strftime(datetime.datetime.now(),"%d.%m.%Y, %H:%M:%S") finally: shutil.rmtree(tmp) return sample -def writeReport(sample, arguments): +def writeReport(sample): report_name = os.path.join(sample.mainResultsPath, "Report", "summary_" + sample.name + "_" + datetime.datetime.strftime(datetime.datetime.now(), "%d-%m-%y_%H-%M")) print("Writing latex " ,report_name) @@ -172,15 +184,17 @@ def writeReport(sample, arguments): env.loader = FileSystemLoader(os.path.dirname(__file__)) template = env.get_template("report.tex") - pdf_latex = template.render(sample=sample, pipeline=Pipeline(), trim_param = sample.readSets[0].trimRes.print_param(arguments["palindrome"],arguments["minlen"], arguments["trimOption"])) + try: + trim_param= sample.readSets[0].trimRes.print_param(arguments["palindrome"], arguments["minlen"], arguments["trimOption"]) + except: + trim_param = "None" + pdf_latex = template.render(sample=sample, pipeline=Pipeline(), trim_param = trim_param ) latex = open(report_name + ".tex", "w") latex.write(pdf_latex) latex.close() process = subprocess.Popen(["pdflatex", "-interaction=nonstopmode", "-output-directory=" + join(sample.mainResultsPath, "Report"), report_name + ".tex"], stdout = subprocess.PIPE, stderr = subprocess.PIPE) - for line in iter(process.stdout.readline, b''): - pass #print(line) for line in iter(process.stderr.readline, b''): print(line) @@ -191,6 +205,140 @@ def writeReport(sample, arguments): os.remove(report_name + ext) except OSError: pass +def check_input_validity(): + # + # Check options for Trimmomatic + #pattern = re.match("(?P<option>\w+):(?P<value1>\d+):(?P<value2>\d+)", arguments["trimOption"]) + #if (arguments["trimOption"] != ""): + # if not ((pattern.group("option") == "SLIDINGWINDOW") or (pattern.group("option") == "MAXINFO")): + # sys.exit("Check trimOption. Only 'SLIDINGWINDOW' or 'MAXINFO' allowed") + # try: + # int(pattern.group("value1")) + # int(pattern.group("value2")) + # except: + # sys.exit("Check trimOptions.") + + # + # Check if adapter is required + if not (arguments['notrimming'] or arguments['adapter'] or arguments["technology"] == "IonTorrent") : + sys.exit("Illumina projects requires an adapter file") + + # + # Check if reference is required and valid + if arguments["nomapping"]: + arguments["reference"] = "" + else: + if not arguments["reference"]: + sys.exit("Mapping needs a reference.") + else: + if not os.path.exists(arguments["reference"]): + sys.exit("Reference does not exist.") + + # + #Check validity of Kraken DB + if not arguments["nokraken"]: + if not os.path.exists(arguments["db"]): + sys.exit(arguments["db"] + " does not exist. Enter a valid database for kraken") + else: + if "database.kdb" not in os.listdir(arguments["db"]): + sys.exit("database " + arguments["db"] + " does not contain necessary file database.kdb") + # + # Check input + if arguments["input"]: + if not os.path.exists(arguments["input"]): + sys.exit(arguments["input"] + " does not exist.") + else: + if not os.path.exists(arguments["r1"]): + sys.exit(arguments["r1"] + " does not exist.Input file required. Use option -input or -1 / -2") + + if arguments["r2"]: + if not os.path.exists(arguments["r2"]): + sys.exit(arguments["r2"] + " does not exist.") + +def get_defaults(): + if arguments["forAssembly"]: + if not arguments["trimBetter_threshold"]: + arguments["trimBetter_threshold"] = parameter["forAssembly." + arguments['technology']][ + "trimBetter_threshold"] + if not arguments["trimOption"]: + arguments["trimOption"] = parameter["forAssembly." + arguments['technology']]["trimOption"] + + elif arguments["forMapping"]: + if not arguments["trimBetter_threshold"]: + arguments["trimBetter_threshold"] = parameter["forMapping." + arguments['technology']][ + "trimBetter_threshold"] + if not arguments["trimOption"]: + arguments["trimOption"] = parameter["forMapping." + arguments['technology']]["trimOption"] + + if not arguments["trimOption"]: + arguments["trimOption"] = parameter["Trimmomatic"]["trimOption"] + if not arguments["trimBetter_threshold"]: + arguments["trimBetter_threshold"] = parameter["Trimmomatic"]["trimBetter_threshold"] + + +def plot(): + try: + # + # Plot BOXPLOTS + boxplots = [{"file": "Per_sequence_quality_scores.csv", + "output": join(arguments["output"], "QCResults/Report/src/img", "Per_sequence_quality_scores.png"), + "title": "Per sequence quality scores", + "ylab": "Mean Sequence Quality (Phred Score)", + "xlab": "Sample"}, + {"file": "Sequence_Length_Distribution.csv", + "output": join(arguments["output"], "QCResults/Report/src/img", "Sequence_Length_Distribution.png"), + "title": "Sequence Length Distribution", + "ylab": "Sequence Length (bp)", + "xlab": "Sample"}, + {"file": "Per_sequence_GC_content.csv", + "output": join(arguments["output"], "QCResults/Report/src/img", "Per_sequence_GC_content.png"), + "title": "Per sequence GC content", + "ylab": "Mean GC content (%)", + "xlab": "Sample"}] + for plot in boxplots: + process = subprocess.Popen(" ".join(["Rscript --vanilla ", join(os.path.dirname(__file__), "boxplot.R"), + join(arguments["output"], "QCResults", "Report", "src", plot["file"]), + plot["output"], '"' + plot["title"] + '"', '"' + plot["xlab"] + '"', + '"' + plot["ylab"] + '"']), + stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True) + #for line in iter(process.stderr.readline, b''): + # print(line) + process.communicate() + + # + # Plot BARPLOTS + process = subprocess.Popen( + " ".join(["Rscript --vanilla ", join(os.path.dirname(__file__), "barplot.R"), join(arguments["output"], "QCResults/Report/src", "summary.json"), + join(arguments["output"], "QCResults/Report/src/img")]), stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True) + process.communicate() + except: + print("Couldnt plot summary") + + +def writeSummaryTex(batch_json): + try: + summary_latex_name = join(arguments["output"], "QCResults", "Report", "summary.tex") + with open(summary_latex_name, "w") as summary_latex: + summary_latex.write("\\rowcolors{2}{gray!10}{}\n\\begin{table}[H]\\begin{tabular}{lcc}\n") + summary_latex.write( + "\\rowcolor{tableBlue} \\textbf{Sample} & \\textbf{Reads after trimming [\#]} & \\textbf{Reads after trimming [\%]} &\\textbf{Map to reference [\#]} & \\textbf{Map to reference[\%]} & \\textbf{Classified (Kraken)} & \\textbf{Shorter fragments [\%]} \\ \n") + + for i in range(0, len(batch_json["summary"])): + summary_latex.write( + str(batch_json["summary"][i]["setname"]) + " & " + + str(batch_json["summary"][i]["Reads after trimming [#]"]) + " & " + + str(batch_json["summary"][i]["Reads after trimming [%]"]) + " & " + + str(batch_json["summary"][i]["Map to reference [#]"]) + " & " + + str(batch_json["summary"][i]["Map to reference [%]"]) + " & " + + str(batch_json["summary"][i]["Classified (Kraken)"]) + " & " + + str(batch_json["summary"][i]["Shorter fragments"]) + "\\\\\n" + ) + summary_latex.write("\\end{tabular}\n") + summary_latex.write("\\caption{Statistics on reads after trimming. }") + summary_latex.close() + except: + + print("Couldnt write summary.tex file.") def main(arguments): @@ -204,7 +352,12 @@ def main(arguments): makedirs(join(arguments["output"], "QCResults", "Report", "src"), exist_ok=True) makedirs(join(arguments["output"], "QCResults", "Report", "src", "img"), exist_ok=True) + if not os.path.exists(join(arguments["output"], "QCResults", "Report", "lib" )): + shutil.copytree(join(os.path.dirname(__file__),"lib"),join(arguments["output"], "QCResults", "Report", "lib" )) + + #### # rename sample names + #### if arguments["rename"]: rename_dict = {} with open(arguments["rename"], "r") as csvfile: @@ -212,159 +365,58 @@ def main(arguments): row = line.split("\t") rename_dict[row[0]] = row[1].strip() arguments["rename"] = rename_dict + + ### # create new boxplot csv files + ### for csv in ["Per_sequence_quality_scores.csv", "Per_sequence_GC_content.csv", "Sequence_Length_Distribution.csv"]: new_csv = open(join(arguments["output"], "QCResults", "Report", "src", csv), "w") new_csv.write("Sample,Read,Lane,Trimmed,Value,Count\n") new_csv.close() - ## - # SAV + + ### + # Write SAV + ### try: if arguments["sav"]: write_SAV(arguments["sav"], join(arguments["output"], "QCResults", "Report", "src")) except: print("Couldnt write SAV.") - ## - if not os.path.exists(join(arguments["output"], "QCResults", "Report", "lib" )): - shutil.copytree(join(os.path.dirname(__file__),"lib"),join(arguments["output"], "QCResults", "Report", "lib" )) - if arguments["technology"].startswith("Illumina"): - technology = "Illumina" - else: - technology = "IonTorrent" - if arguments["forAssembly"]: - if not arguments["trimBetter_threshold"]: - arguments["trimBetter_threshold"] = parameter["forAssembly." + technology]["trimBetter_threshold"] - if not arguments["trimOption"]: - arguments["trimOption"] = parameter["forAssembly." + technology]["trimOption"] - - elif arguments["forMapping"]: - if not arguments["trimBetter_threshold"]: - arguments["trimBetter_threshold"] = parameter["forMapping." + technology]["trimBetter_threshold"] - if not arguments["trimOption"]: - arguments["trimOption"] = parameter["forMapping." + technology]["trimOption"] - - if not arguments["trimOption"]: - arguments["trimOption"] = parameter["Trimmomatic"]["trimOption"] - if not arguments["trimBetter_threshold"]: - arguments["trimBetter_threshold"] = parameter["Trimmomatic"]["trimBetter_threshold"] - - # check input - pattern = re.match("(?P<option>\w+):(?P<value1>\d+):(?P<value2>\d+)", arguments["trimOption"]) - if(arguments["trimOption"]!=""): - if not ((pattern.group("option") =="SLIDINGWINDOW") or (pattern.group("option") =="MAXINFO")): - sys.exit("Check trimOption. Only 'SLIDINGWINDOW' or 'MAXINFO' allowed") - try: - int(pattern.group("value1")) - int(pattern.group("value2")) - except: - sys.exit("Check trimOptions.") - - if arguments['technology'] == "Illumina": - if not arguments['adapter']: - sys.exit("Illumina projects requires an adapter file") - if arguments["nomapping"]: - arguments["reference"] = "" - else: - if not arguments["reference"]: - sys.exit("Mapping needs a reference.") - else: - if not os.path.exists(arguments["reference"]): - sys.exit("Reference does not exist.") - - if not arguments["nokraken"]: - if not os.path.exists(arguments["db"]): - sys.exit(arguments["db"] + " does not exist. Enter a valid database for kraken") - else: - if "database.kdb" not in os.listdir(arguments["db"]): - sys.exit("database "+arguments["db"] +" does not contain necessary file database.kdb") - if not arguments["input"]: - if arguments["r1"]: - print(os.path.abspath(arguments["r1"]),os.path.exists(arguments["r1"])) - if not os.path.exists(arguments["r1"]): - sys.exit(arguments["r1"] + " does not exist.") - else: - sys.exit("Input file required. Use option -input or -1 / -2") - - if arguments["r2"]: - if not os.path.exists(arguments["r2"]): - sys.exit(arguments["r2"] + " does not exist.") + # + # Check input validity + check_input_validity() if arguments["index"]: mapping_index_path = arguments["index"] else: mapping_index_path = join(tmp, "bowtie2_index") os.makedirs(mapping_index_path, exist_ok=True) - # RUN - try: - batch_json = json.dump({"commandline":arguments, "summary":[],"kraken":{}, "versions":Pipeline().__dict__}, open(join(arguments["output"], "QCResults/Report/src", "summary.json"),"w")) - i = 1 - project = [] - tmp_overview = open(join(tmp, "overview.csv"), "w") - tmp_overview.write("Sample,Trimmed,Mapped,Classified\n") - myFiles = [] - - #filter samples and put it into for loop - # Get folder and filter by last three pattern splitted by "_" - if arguments["input"]: - input = [ join(arguments["input"], x) for x in sorted(os.listdir(arguments["input"]))] - #### check input format #### - # Case 1: Folder contains fastq files - if len([x for x in input if isfile(x) and any([x.endswith(ext) for ext in file_extensions]) ])>0: - input= [x for x in input if isfile(x) and any([x.endswith(ext) for ext in file_extensions]) ] - if len(input)!=0: - if arguments["technology"].startswith("Illumina"): - for setname, files in groupby(input, key=lambda x: "_".join(x.split("_")[:-3])): - #for setname, files in groupby(files, key=lambda x: "_".join(x.split("_")[:-2])): - temp_file = sorted(list(files)) - r1_reads = [x for x in temp_file if r1_pattern in x] - r2_reads = [x for x in temp_file if r2_pattern in x] - if len(r1_reads) != 0: - if len(r2_reads) != 0: - myFiles.append([r1_reads, r2_reads]) - else: - myFiles.append(r1_reads) - else: - # IonTorrent - myFiles = input - # Case 2: Folder contains subfolder for each sample - else: - input = [x for x in input if os.path.isdir(x)] - - if len(input) > 0: - for sample in input: - files = sorted([join(sample,x) for x in listdir(sample) if any([x.endswith(ext) for ext in file_extensions])]) - if arguments["technology"].startswith("Illumina"):# and len(files) > 1: - for setname, files in groupby(files, key=lambda x: "_".join(x.split("_")[:-3])): - temp_file = sorted(list(files)) - r1_reads = [x for x in temp_file if r1_pattern in x] - r2_reads = [x for x in temp_file if r2_pattern in x] - if len(r1_reads)!=0: - if len(r2_reads)!=0: - myFiles.append([r1_reads, r2_reads]) - else: - myFiles.append(r1_reads) - else: - myFiles.append(sorted(list(files))) - files = None - if len(myFiles) == 0: - sys.exit("Enter a valid folder, which contain sample folders") - else: - if arguments["r1"]: - if not os.path.exists(arguments["r1"]): - sys.exit(arguments["r1"] + " does not exist.") - if arguments["r2"]: - myFiles.append([arguments["r1"], arguments["r2"]]) - else: - myFiles.append([arguments["r1"]]) + myFiles = get_input() + print("Found " + str(len(myFiles)) + " sample(s).") - print("Found " + str(len(myFiles)) + " sample(s).") + # + # Get parameter defaults + get_defaults() + batch_json = json.dump({ + "commandline": arguments, + "summary": [], + "kraken": {}, + "versions": Pipeline().__dict__}, + open(join(arguments["output"], "QCResults/Report/src", "summary.json"), "w")) + + output = join(arguments["output"], "QCResults") # getDir([arguments["output"], "QCResults"], True) + + i = 1 + ####### + # Run for each sample + ####### + try: for subset in myFiles: print("Analyse " + str(i) + "/" + str(len(myFiles))) - sample_tmp = join(tmp, "Sample_" + str(i)) os.makedirs(sample_tmp, exist_ok=True) @@ -375,11 +427,17 @@ def main(arguments): if len(subset)>1: arguments["r2"] = subset[1] - sample = fillClasses( mapping_index_path, sample_tmp ) + #### + # Run Analyses + #### + sample = runAnalyses( mapping_index_path, sample_tmp ) + + #### + # Write output + #### if sample is not None: - project.append(sample) try: - writeReport(sample, arguments) + writeReport(sample) except: print("Couldnt write pdf.") try: @@ -396,88 +454,43 @@ def main(arguments): else: kraken = sample.krakenRes.pClassified batch_json["kraken"][sample.name] = json.loads(str_to_json(sample.krakenRes.report)) - tmp_overview.write(",".join([sample.name, str(sample.nTrimmed()), str(nMapping), str(kraken)]) +"\n") + #tmp_overview.write(",".join([sample.name, str(sample.nTrimmed()), str(nMapping), str(kraken)]) +"\n") + try: + trim_param = sample.readSets[0].trimRes.print_param(arguments["palindrome"], arguments["minlen"], + arguments["trimOption"]) + except: + trim_param="None" batch_json["summary"].append( OrderedDict([("setname", sample.name), - ("Total reads [#]", sample.total_reads()), - ("Reads after trimming [#]", sample.nTrimmed()), - ("Reads after trimming [%]", sample.pTrimmed()), + ("Total reads [#]", sample.total_reads()), + ("Reads after trimming [#]", sample.nTrimmed()), + ("Reads after trimming [%]", sample.pTrimmed()), ("Map to reference [#]", nMapping), ("Map to reference [%]", pMapping), ("Classified (Kraken)", kraken), ("Shorter fragments", sample.get_mean_trimRes()), - ("Trim Parameter", sample.readSets[0].trimRes.print_param(arguments["palindrome"],arguments["minlen"],arguments["trimOption"])), + ("Trim Parameter",trim_param), ("images", [x.get_all_qc_dict() for x in sample.readSets])])) json.dump(batch_json, open(join(arguments["output"], "QCResults/Report/src", "summary.json"), "w")) except: print("Couldnt produce HTML report.") + sample = None + arguments["r1"]=None + arguments["r2"]=None i+=1 + # plot boxplots and barplots + plot() + writeSummaryTex(batch_json) + shutil.copy(join(os.path.dirname(__file__), "batch_report.html"), join(arguments["output"], "QCResults", "Report", "batch_report.html")) - try: - # plot boxplots - boxplots = [{"file":"Per_sequence_quality_scores.csv", - "output": join(arguments["output"], "QCResults/Report/src/img", "Per_sequence_quality_scores.png"), - "title": "Per sequence quality scores", - "ylab": "Mean Sequence Quality (Phred Score)", - "xlab": "Sample" }, - {"file": "Sequence_Length_Distribution.csv", - "output": join(arguments["output"], "QCResults/Report/src/img", "Sequence_Length_Distribution.png"), - "title": "Sequence Length Distribution", - "ylab": "Sequence Length (bp)", - "xlab": "Sample"}, - {"file": "Per_sequence_GC_content.csv", - "output": join(arguments["output"], "QCResults/Report/src/img", "Per_sequence_GC_content.png"), - "title": "Per sequence GC content", - "ylab": "Mean GC content (%)", - "xlab": "Sample"} ] - for plot in boxplots: - process = subprocess.Popen(" ".join(["Rscript --vanilla ",join(os.path.dirname(__file__) ,"boxplot.R"), - join(arguments["output"],"QCResults", "Report", "src", plot["file"]), plot["output"], '"'+ plot["title"]+'"', '"'+plot["xlab"]+'"','"'+ plot["ylab"]+'"']), - stderr=subprocess.PIPE, stdout= subprocess.PIPE, shell =True) - for line in iter(process.stderr.readline, b''): - print(line) - process.communicate() - #plot barplots - tmp_overview.close() - process = subprocess.Popen(" ".join(["Rscript --vanilla ", join(os.path.dirname(__file__), "barplot.R"),join(tmp, "overview.csv"), - join(arguments["output"], "QCResults/Report/src/img")]),stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True) - for line in iter(process.stderr.readline, b''): - print(line) - process.communicate() - shutil.copy(join(os.path.dirname(__file__), "batch_report.html"), join(arguments["output"], "QCResults", "Report", "batch_report.html")) - except: - print("Couldnt plot boxplots.") - try: - summary_latex_name = join(arguments["output"], "QCResults", "Report", "summary.tex") - with open(summary_latex_name, "w") as summary_latex: - summary_latex.write("\\rowcolors{2}{gray!10}{}\n\\begin{table}[H]\\begin{tabular}{lcc}\n") - summary_latex.write( - "\\rowcolor{tableBlue} \\textbf{Sample} & \\textbf{Map to reference [\#]} & \\textbf{Map to reference[\%]} & \\textbf{Reads after trimming [\#]} & \\textbf{Reads after trimming [\%]} & \\textbf{Classified (Kraken)} & \\textbf{Shorter fragments [\%]} \\ \n") - - for i in range(0, len(batch_json["summary"])): - summary_latex.write( - str(batch_json["summary"][i]["setname"]) + " & " + - str(batch_json["summary"][i]["Map to reference [#]"]) + " & " + - str(batch_json["summary"][i]["Map to reference [%]"]) + " & " + - str(batch_json["summary"][i]["Reads after trimming [#]"]) + " & " + - str(batch_json["summary"][i]["Reads after trimming [%]"]) + " & " + - str(batch_json["summary"][i]["Classified (Kraken)"]) + " & " + - str(batch_json["summary"][i]["Shorter fragments"]) + "\\\\\n" - ) - summary_latex.write("\\end{tabular}\n") - summary_latex.write("\\caption{Statistics on reads after trimming. }") - summary_latex.close() - except: - - print("Couldnt write summary.tex file.") finally: shutil.rmtree(tmp) delete_files(join(arguments["output"],"QCResults"), "FastQC", "_fastqc.html") delete_files(join(arguments["output"],"QCResults"), "Trimmed/FastQC", "_fastqc.html") - + if __name__ == "__main__": config = configparser.ConfigParser() config.read(join(dirname(__file__), "config.txt")) @@ -492,7 +505,7 @@ if __name__ == "__main__": parser.add_argument( '-2', dest='r2', help = "input file", required=False) parser.add_argument('-output', dest='output', default="") - parser.add_argument('-technology', dest='technology',choices=['Illumina',"IonTorrent"] ,required = True) + parser.add_argument('-technology', dest='technology',choices=['Illumina',"IonTorrent"] ,required=False , help = "If not set, automatically determine technology and search for fastq and bam files. Set technology to IonTorrent if all files are bam-files, else set technology to Illumina.") parser.add_argument('-threads', dest='threads', default = 4, type = int, help= "Number of threads. Default: %(default)s") parser.add_argument('-adapter', dest = 'adapter', choices = ['TruSeq2-PE', 'TruSeq2-SE' , 'TruSeq3-PE' , 'TruSeq3-SE' , 'TruSeq3-PE-2', 'NexteraPE-PE' ]) parser.add_argument('-reference', dest='reference', required = False, help = "Map reads against reference") @@ -503,16 +516,25 @@ if __name__ == "__main__": parser.add_argument('-db', dest='db', help='Kraken database', required = False, default= kraken_db) parser.add_argument('-palindrome', dest='palindrome', default=30, help= 'Use palindrome parameter 30 or 1000 for further analysis. Default: %(default)s', type = int) parser.add_argument('-minlen', default=50,dest='minlen',help='Minlen parameter for Trimmomatic. Default: %(default)s', type=int) - parser.add_argument('-trimOption', dest="trimOption", help='Use maxinfo or slidinginfo for Trimmomatic.MAXINFO:<target length>:<strictness> | SLIDINGWINDOW:<window size>:<required quality>. default: SLIDINGWINDOW:4:15', type= str) + parser.add_argument('-trimOption', dest="trimOption", help='Use maxinfo or slidinginfo for Trimmomatic.MAXINFO:<target length>:<strictness> | SLIDINGWINDOW:<window size>:<required quality>.', type= str) parser.add_argument('-version', action='version', version='%(prog)s v' + __version__) parser.add_argument('-nokraken', action = "store_true") parser.add_argument('-nomapping', action = "store_true") + parser.add_argument('-notrimming', action = "store_true") parser.add_argument('-gz', action = "store_true") parser.add_argument('-trimBetter', action = "store_true", help= "Optimize trimming parameter using 'Per sequence base content' from fastqc. Not recommended for amplicons.") parser.add_argument('-trimBetter_threshold', dest='trimBetter_threshold', help = "Set -trimBetter to use this option.Default setting for Illumina: 0.15 and for IonTorrent: 0.25.", required=False, type=float) parser.add_argument('-forAssembly', action = "store_true", help = "Set -trimBetter to use this option.Trim parameter are optimized for assemblies (trim more aggressive).") parser.add_argument('-forMapping', action = "store_true", help = "Set -trimBetter to use this option.Trim parameter are optimized for mapping (allow more errors).") parser.add_argument('-rename', dest="rename", required=False, help="TSV File with two columns: <old sample name> <new sample name>") + parser.add_argument('-parameters', dest = 'parameters', default = join(dirname(__file__), "parameter.txt")) arguments = vars(parser.parse_args()) + parameter = configparser.ConfigParser(strict=False, interpolation=None) + parameter.read(arguments['parameters']) + + r1_pattern = parameter["Pattern"]["R1"] + r2_pattern = parameter["Pattern"]["R2"] + sep_pattern = parameter["Pattern"]["sep"] + lane_pattern = parameter["Pattern"]["lane"] main(arguments) diff --git a/barplot.R b/barplot.R index 350342c..93dfd20 100755 --- a/barplot.R +++ b/barplot.R @@ -1,29 +1,45 @@ options(warn=-1) -library(ggplot2) +require(jsonlite) +require(ggplot2) + args = commandArgs(trailingOnly=TRUE) -summary<- as.data.frame(read.csv(args[1])) +convert2filename<- function(string, ext=".png"){ + string<- gsub("\\[%\\]", "percentage", string) + string<- gsub("\\[#\\]", "number", string) + string<- gsub(" ", "_", string) + return(paste(string, ext, sep="")) +} -ggplot(summary, aes(x=Sample, y = Trimmed )) + - geom_bar(stat = "identity", fill="#67a9cf") + - theme(axis.text.x=element_text(angle=90, hjust=1, vjust = 0.5)) + - ggtitle("Number of reads after trimming") + - xlab("Sample")+ - ylab("Reads [#]") -ggsave(paste(args[2], "/trimming.png", sep="")) +summary_json<- jsonlite::fromJSON(args[1])$summary +tablenames<- names(summary_json)[!names(summary_json) %in% c("images", "Trim Parameter")] -ggplot(summary, aes(x=Sample, y = Mapped )) + - geom_bar(stat = "identity", fill="#67a9cf") + - theme(axis.text.x=element_text(angle=90, hjust=1, vjust = 0.5)) + - ggtitle("Number of reads mapped to reference") + +summary_json<- summary_json[tablenames] +#summary<- as.data.frame(read.csv(args[1])) +for( i in tablenames[2:length(tablenames)]){ + +ggplot(summary_json, aes(x=summary_json[,"setname"], y = summary_json[,i]), environment = environment()) + + geom_bar(stat = "identity", fill="#67a9cf") + + theme(axis.text.x=element_text(angle=90, hjust=1, vjust = 0.5), legend.position = "none") + + ggtitle(i) + xlab("Sample")+ - ylab("Reads [#]") -ggsave(paste(args[2], "/mapping.png", sep="")) + ylab(i) + ggsave(paste(args[2], convert2filename(i), sep="/")) -ggplot(summary, aes(x=Sample, y = Classified )) + - geom_bar(stat = "identity", fill="#67a9cf") + +} +temp_json <- data.frame(rbind(cbind( setname = summary_json$setname, type = "Total reads [#]", value= summary_json$`Total reads [#]`), + cbind( setname = summary_json$setname, type = "Reads after trimming [#]", value= summary_json$`Reads after trimming [#]`) + )) +temp_json$value <- as.numeric(as.character(temp_json$value)) + +ggplot(temp_json, aes(x= )) + +ggplot(temp_json, aes(x=setname, y = value, by=type, fill=type))+ + geom_bar(stat="identity",position = "identity", alpha= 0.9) + theme(axis.text.x=element_text(angle=90, hjust=1, vjust = 0.5)) + - ggtitle("Precentage of reads classified by Kraken") + + ggtitle("Number of Reads") + xlab("Sample")+ - ylab("Reads [%]") -ggsave(paste(args[2], "/kraken.png", sep="")) + ylab("Number of Reads") +ggsave(paste(args[2], "number_of_reads.png", sep="/")) + + diff --git a/batch_report.html b/batch_report.html index 23fc0f4..104a42b 100755 --- a/batch_report.html +++ b/batch_report.html @@ -573,18 +573,18 @@ </br> <div class="col-md-4"> - <a ng-click="selected.title='Number of reads remained after trimming'; selected.caption=''; selected.img='src/img/trimming.png'" + <a ng-click="selected.title='Number of reads remained after trimming'; selected.caption=''; selected.img='src/img/Reads_after_trimming_number.png'" data-image-id="" data-toggle="modal" data-target="#image-gallery"> - <img height="500px" ng-src="src/img/trimming.png"> + <img height="500px" ng-src="src/img/Reads_after_trimming_number.png"> </a> </div> <div class="col-md-4"> - <a ng-click="selected.title='Number of reads mapped to reference'; selected.caption=''; selected.img='src/img/mapping.png'" + <a ng-click="selected.title='Number of reads mapped to reference'; selected.caption=''; selected.img='src/img/Map_to_reference_number.png'" data-image-id="" data-toggle="modal" data-target="#image-gallery"> - <img height="500px" ng-src="src/img/mapping.png"> + <img height="500px" ng-src="src/img/Map_to_reference_number.png"> </a> </div> @@ -593,7 +593,7 @@ <a ng-click="selected.title='Percentage of reads classified by Kraken'; selected.caption=''; selected.img='src/img/kraken.png'" data-image-id="" data-toggle="modal" data-target="#image-gallery"> - <img height="500px" ng-src="src/img/kraken.png"> + <img height="500px" ng-src="src/img/Classified_(Kraken).png"> </a> </div> diff --git a/classes.py b/classes.py index bbfb42f..3066dde 100755 --- a/classes.py +++ b/classes.py @@ -79,18 +79,17 @@ class Sample(object): def get_all_qc_plots(self): qc_dict = OrderedDict([("setname", self.name),("raw",[]), ("trimmed",[])]) for rs in self.readSets: - qc_dict["raw"].append(rs.r1.qcRes.get_plots()) - qc_dict["trimmed"].append(rs.trimRes.readset.r1.qcRes.get_plots()) + qc_dict["raw"].append(rs.r1.qcRes.plots) + qc_dict["trimmed"].append(rs.trimRes.readset.r1.qcRes.plots) if (rs.r2 is not None): - qc_dict["raw"].append(rs.r2.qcRes.get_plots()) - qc_dict["trimmed"].append(rs.trimRes.readset.r2.qcRes.get_plots()) + qc_dict["raw"].append(rs.r2.qcRes.plots) + qc_dict["trimmed"].append(rs.trimRes.readset.r2.qcRes.plots) return qc_dict def total_reads(self): total = 0 for rs in self.readSets: - if rs.trimRes: - total += rs.trimRes.total + total += rs.numberOfReads return total def pTrimmed(self): @@ -112,15 +111,15 @@ class Sample(object): self.readSets.append(readSet) return self - def run_Kraken(self, db): - r1, r2 = self.get_all_reads() + def run_Kraken(self, db, trimmed): + r1, r2 = self.get_all_reads(trimmed) self.krakenRes = KrakenResult(r1,r2,db, self.name) return self - def run_Bowtie2(self, bowtie_index, save_mapping ): - r1, r2 = self.get_all_reads() + def run_Bowtie2(self, bowtie_index, save_mapping, trimmed ): + r1, r2 = self.get_all_reads(trimmed) self.mappingRes = MappingResult(self.reference, r1, r2, bowtie_index) - self.mappingRes = self.mappingRes.run_bowtie(self.threads, save_mapping) + self.mappingRes = self.mappingRes.run_bowtie(save_mapping) return self.mappingRes def get_all_reads(self, trimmed = True): @@ -133,7 +132,10 @@ class Sample(object): return r1,r2 def get_mean_trimRes(self): - return round(np.mean([x.trimRes.shorter_fragments_percentage for x in self.readSets]),2) + try: + return round(np.mean([x.trimRes.shorter_fragments_percentage for x in self.readSets]),2) + except: + return None class ReadSet: ''' Readset contains of r1 and r2''' @@ -151,7 +153,6 @@ class ReadSet: self.numberOfReads = 0 self.numberofTrimmedReads = 0 self.trimRes = None - #self.outputDir = outputDir self.paired = True if len( basename(self.r1.filename).split("_")[:-3]) > 4: self.setname = "_".join(basename(self.r1.filename).split("_")[:-3]) @@ -162,8 +163,10 @@ class ReadSet: def run_FastQC(self, output): self.r1 = self.r1.run_FastQC(output) + self.numberOfReads += self.r1.qcRes.total_reads if(self.r2 is not None): self.r2 = self.r2.run_FastQC(output) + self.numberOfReads += self.r1.qcRes.total_reads return self def run_Trimmomatic(self, adapter, palindrome, minlen, trimOption, betterTrimming,gz): @@ -177,19 +180,14 @@ class ReadSet: return qc_results def get_all_qc_dict(self): - '''qc_dict =OrderedDict( {"raw":[],"trimmed":[]} ) - qc_dict["raw"] = [[x.__dict__ for x in self.r1.qcRes.results_to_base64()]] - qc_dict["trimmed"] = [[x.__dict__ for x in self.trimRes.readset.r1.qcRes.results_to_base64()]] - if (self.r2 is not None): - qc_dict["raw"].append([x.__dict__ for x in self.r2.qcRes.results_to_base64()]) - qc_dict["trimmed"].append([x.__dict__ for x in self.trimRes.readset.r2.qcRes.results_to_base64()]) - return qc_dict''' qc_dict = OrderedDict({"raw": [], "trimmed": []}) qc_dict["raw"] = [[x.__dict__ for x in self.r1.qcRes.results_for_report()]] - qc_dict["trimmed"] = [[x.__dict__ for x in self.trimRes.readset.r1.qcRes.results_for_report()]] + if self.trimRes: + qc_dict["trimmed"] = [[x.__dict__ for x in self.trimRes.readset.r1.qcRes.results_for_report()]] if (self.r2 is not None): qc_dict["raw"].append([x.__dict__ for x in self.r2.qcRes.results_for_report()]) - qc_dict["trimmed"].append([x.__dict__ for x in self.trimRes.readset.r2.qcRes.results_for_report()]) + if self.trimRes: + qc_dict["trimmed"].append([x.__dict__ for x in self.trimRes.readset.r2.qcRes.results_for_report()]) return qc_dict @@ -202,7 +200,6 @@ class FastQFile: self.concat_files = None def run_FastQC(self, outputDir): - print("Run fastqc in fastqfile ", " ".join([join(get_tool_path('fastqc'), "fastqc") , self.filename , "-o" , outputDir, "-t", str(num_threads), "--extract"])) process = subprocess.Popen([join(get_tool_path('fastqc'), "fastqc") , self.filename , "-o" , outputDir, "-t", str(num_threads), "--extract"], stdout = subprocess.PIPE, stderr = subprocess.PIPE) for line in iter(process.stderr.readline, b''): line = line.decode("utf-8") @@ -217,7 +214,6 @@ class FastQFile: pattern = re.match("Quality encoding detected as (?P<phred>\w+\d{2})", line) if pattern: self.phred = pattern.group("phred") - print("") process.communicate() self.qcRes = QCResult(join(outputDir, getFilenameWithoutExtension(basename(self.filename)) + "_fastqc"), join(mainResultsPath, "Report", "src")) return self @@ -231,8 +227,8 @@ class QCResult: def __init__(self, resultsPath, summary_output): self.path = resultsPath self.results = self.get_results() - #write summary csv for boxplots - write_fastqc_summary(resultsPath, summary_output) + #write summary csv for boxplots and assign total number of reads + self.total_reads = write_fastqc_summary(resultsPath, summary_output) def get_results(self): summary_list = [] @@ -254,36 +250,6 @@ class QCResult: summary_list.append(Plot("NotExisting", row[1], "NotExisting")) return summary_list - def get_plots(self): - print("extracting data ", self.path) - store = False - data = {"name": basename(self.path).replace("_fastc",""), - "Sequence_Length_Distribution": [], - "Per_sequence_GC_content": [], - "Per_base_sequence_quality": []} - - with open(join(self.path, "fastqc_data.txt"), "r") as fastqcdatafile: - box_values = [] - for line in iter(fastqcdatafile): - if line.startswith("#"): - pass - elif line.startswith(">>END_MODULE"): - if store==True: - data[key] = get_distribution(box_values, key) - store = False - key = None - box_values = [] - elif line.startswith(">>"): - key = line.split("\t")[0] - key=key.replace(">>","") - key = key.replace(" ","_") - if key in data.keys(): - store = True - else: - if store: - box_values.append(line) - return data - def results_to_base64(self): converted_plot = [] for plot in self.results: @@ -357,6 +323,7 @@ class TrimResult: self.logs = "" #Results + self.total=0 self.shorter_fragments_percentage = 0 self.input_number = 0 self.nTrimmed = 0 @@ -454,6 +421,8 @@ class TrimResult: process = subprocess.Popen(" ".join(call), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) survived1, total, nTrimmedReads, pTrimmedReads, log = trimmomatic_report(process, call[0], True) process.communicate() + if survived1==0: + sys.exit("No reads remained after trimming. Check fastqc results before trimming in QCResults/Fastqc. Please rerun without -trimBetter or check -trimOptions and minlen again.") self.total = total self.nTrimmed = nTrimmedReads @@ -521,17 +490,18 @@ class MappingResult: index_name = join(index, getFilenameWithoutExtension(basename(reference)) +"_bt2") # reference name "bowtie2 build index_name" if not isdir(index): return index - process = subprocess.Popen([join(get_tool_path("bowtie2"),"bowtie2-build"),reference, index_name ], stderr=subprocess.PIPE, stdout = subprocess.PIPE) + if not exists(index_name + ".1.bt2"): + process = subprocess.Popen([join(get_tool_path("bowtie2"),"bowtie2-build"),reference, index_name ], stderr=subprocess.PIPE, stdout = subprocess.PIPE) - for line in iter(process.stderr.readline, b''): - line = line.decode("utf-8") - print(line) - process.communicate() - print("Bowtie2-build. Done.") + for line in iter(process.stderr.readline, b''): + line = line.decode("utf-8") + print(line) + process.communicate() + print("Bowtie2-build. Done.") return index_name - def run_bowtie(self, threads, save_mapping): + def run_bowtie(self, save_mapping): print("Run Bowtie2") call = [join(get_tool_path("bowtie2"), "bowtie2"), "-x", self.index, "--no-unal --threads ", str(num_threads)] diff --git a/config.txt b/config.txt index 124f887..258a35a 100755 --- a/config.txt +++ b/config.txt @@ -1,10 +1,10 @@ [DEFAULT] -kraken_db =/opt/common/pipelines/databases/minikraken_20141208/ +kraken_db = /opt/common/pipelines/databases/minikraken_20141208/ [PATH] kraken = -adapter = /data/GS/tools/Trimmomatic-0.32/adapters/ +adapter = /opt/common/pipelines/tools/Trimmomatic-0.36/adapters/ fastqc = /opt/common/pipelines/tools/FastQC_0.11.5/ -trimmomatic = +trimmomatic = /opt/common/pipelines/tools/Trimmomatic-0.36/ bowtie2 = diff --git a/helper.py b/helper.py index d7fb524..c849e5a 100755 --- a/helper.py +++ b/helper.py @@ -151,7 +151,7 @@ def optimize_trimming(fastqc_folder,perc=0.1): column[nucl].mask[-(i + 5):(min(len(mytable[nucl]), len(mytable[nucl]) - i))] = True if (perc_slope(numpy.mean(mytable[nucl][-(i + 6): (min(len(mytable[nucl]) - 1, len(mytable[nucl]) - 1 - i))]),numpy.mean(column[nucl]), perc=perc)) & (tailcrop > len(mytable[nucl]) - (i + 5)): # if perc_slope(numpy.var(mytable[nucl][-(i+6): (min(len(mytable["A"])-1, len(mytable[nucl])-1-i))]), numpy.mean(numpy.var(column)), perc=perc): - print("TAILCROP", tailcrop) + #print("TAILCROP", tailcrop) tailcrop = len(mytable[nucl]) - (i + 5) trim_bool = True else: @@ -215,9 +215,9 @@ def delete_files(path, extended="", pattern="_fastqc"): else: shutil.rmtree(filename) - +''' def get_distribution(lines, key): - '''creates CSV used for boxplotting''' + #creates CSV used for boxplotting output = [] for line in lines: line = line.strip() @@ -235,7 +235,7 @@ def get_distribution(lines, key): v = fields[0] # quality or gc [%] output.extend(int(v) * [int(float(fields[1]))]) return output - +''' def createCSV(dataset, lines, key, path, trimmed): '''creates CSV used for boxplotting''' #output = [] @@ -280,6 +280,8 @@ def write_fastqc_summary(fastqcdir, output): if line.startswith("#"): pass # print("comment") # continue + elif line.startswith("Total Sequences"): + total_seq = int(line.split("\t")[-1]) elif line.startswith('>>END_MODULE'): if len(data) > 0: name1 = basename(fastqcdir) @@ -300,6 +302,7 @@ def write_fastqc_summary(fastqcdir, output): if store: data.extend([line]) fastqcdatafile.close() + return total_seq def write_SAV(folder, output): print("Plot SAV images...") diff --git a/parameter.txt b/parameter.txt index 14aeca2..ea2902e 100755 --- a/parameter.txt +++ b/parameter.txt @@ -35,13 +35,12 @@ trimOption = SLIDINGWINDOW:4:15 trimBetter_threshold = 0.25 trimOption = SLIDINGWINDOW:4:15 -[Adapter] -Illumina Universal Adapter = TruSeq2 - [Fileextension] -fastq = .fastq.gz, .fastq, .fq, .fq.gz, .bam +fastq = .fastq.gz, .fastq, .fq, .fq.gz +bam = .bam [Pattern] -R1 = _R1_ -R2 = _R2_ -lane = _L\d{3}_ \ No newline at end of file +sep=_ +R1 = R1 +R2 = R2 +lane = L\d{3} \ No newline at end of file diff --git a/report.tex b/report.tex index e687aa0..65558cc 100755 --- a/report.tex +++ b/report.tex @@ -67,9 +67,11 @@ Processed reads: \\ %-------------------- Summary -------------------% \begin{tcolorbox} {\large{Summary} } \\ -{{sample.nTrimmed()}} of {{sample.total_reads()}} ({{sample.pTrimmed()}}\%) reads remained after trimming \\ +{% if sample.nTrimmed() != 0 %} {{sample.nTrimmed()}} of {{sample.total_reads()}} ({{sample.pTrimmed()}}\%) reads remained after trimming \\ {{sample.get_mean_trimRes()}} \% of fragments were shorter than read length\\ - +{% else %} +No trimming was performed \\ +{% endif %} {%if sample.mappingRes !=None%}{{sample.mappingRes.numberOfAlignedReads}} ({{sample.mappingRes.percentAlignedReads}}\%) of reads aligned to \path{ {{sample.reference}} }\\ {%endif%} {%if sample.krakenRes !=None%}{{sample.krakenRes.nClassified}} sequences classified ({{sample.krakenRes.pClassified}}\%) {%endif%} @@ -117,12 +119,13 @@ Trimming Log: \\ {\includegraphics[width=\textwidth]{/{{read.r1.qcRes.results[i].file}}} } \end{subfigure} \end{adjustbox}\qquad - \begin{adjustbox}{minipage=0.4\textwidth-2\fboxrule, cframe = {{read.trimRes.readset.r1.qcRes.results[i].color}} 2}% - \begin{subfigure}% + {% if sample.nTrimmed() != 0 %} \begin{adjustbox}{minipage=0.4\textwidth-2\fboxrule, cframe = {{read.trimRes.readset.r1.qcRes.results[i].color}} 2}% + \begin{subfigure}% {\includegraphics[width=\textwidth]{/{{read.trimRes.readset.r1.qcRes.results[i].file}}} } \end{subfigure} \end{adjustbox} \caption{ {{read.trimRes.readset.r1.qcRes.results[i].name}}} + {% endif %} \end{figure} {% endfor %} @@ -145,12 +148,13 @@ Concatenated files: \\ {\includegraphics[width=\textwidth]{/{{read.r2.qcRes.results[i].file}}} } \end{subfigure} \end{adjustbox}\qquad - \begin{adjustbox}{minipage=0.4\textwidth -2\fboxrule, cframe = {{read.trimRes.readset.r2.qcRes.results[i].color}} 2}% + {% if sample.nTrimmed() != 0 %} \begin{adjustbox}{minipage=0.4\textwidth -2\fboxrule, cframe = {{read.trimRes.readset.r2.qcRes.results[i].color}} 2}% \begin{subfigure}% {\includegraphics[width=\textwidth]{/{{read.trimRes.readset.r2.qcRes.results[i].file}}} } \end{subfigure} \end{adjustbox} \caption{ {{read.trimRes.readset.r2.qcRes.results[i].name}}} + {% endif %} \end{figure} {% endfor %} {% endif %} -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/qcumber.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
