Commit 50286b16 authored by Claire Kuchly's avatar Claire Kuchly
Browse files

add barcode analysis

parent 46ed4630
......@@ -28,7 +28,12 @@ class PacBioQualityCheck (NG6Workflow):
def define_parameters(self, function="process"):
self.add_parameter("nb_threads", "Number of threads to use for fastqc. Each thread will be allocated 250MB of memory.", default=3)
self.add_parameter("min_subreads_length", "Subreads shorter than this value (in base pairs) are filtered out and excluded from analysis", default=0, type='int')
self.add_parameter("polymerase_read_qual", "Polymerase reads with lower quality than this value are filtered out and excluded from analysis", default=0, type='float')
self.add_parameter("polymerase_read_length", "Polymerase reads shorter than this value (in base pairs) are filtered out and excluded from analysis", default=0, type='int')
self.add_input_file( "barcode_file", "Input barcode file", default=None)
self.add_parameter("barcode_score", "Min identical base for barcode", default=22, type='int')
def process(self):
sample_names = []
infiles = []
......@@ -39,6 +44,6 @@ class PacBioQualityCheck (NG6Workflow):
add_pacbio_raw_file = self.add_component("AddPacBioRawFiles", [self.runobj, self.get_all_reads()])
h5tofastq = self.add_component("H5toFastq", [sample_names, infiles])
fastqc = self.add_component("FastQC", [h5tofastq.output_fastqs, False, False, "fastqc.tar.gz", self.nb_threads], parent = h5tofastq)
self.add_component("RS_Subreads", [sample_names, infiles])
self.add_component("RS_Subreads", [sample_names, infiles,self.min_subreads_length,self.polymerase_read_qual,self.polymerase_read_length,self.barcode_file,self.barcode_score ])
\ No newline at end of file
......@@ -25,7 +25,7 @@ from ng6.analysis import Analysis
from weaver.function import PythonFunction
def rs_subreads(inputfile, componentdir, smrtpipe, fofnToSmrtpipeInput, min_subreads_length, polymerase_read_qual, polymerase_read_length, stdout):
def rs_subreads(inputfile, stdout, componentdir, smrtpipe, fofnToSmrtpipeInput, min_subreads_length, polymerase_read_qual, polymerase_read_length, barcode_file, barcode_score ):
import subprocess
import os
......@@ -42,9 +42,19 @@ def rs_subreads(inputfile, componentdir, smrtpipe, fofnToSmrtpipeInput, min_subr
if not os.path.exists(outputdir) :
subprocess.check_call("mkdir %s >> %s"%(outputdir, stdout), shell=True)
barcode_string=""
if barcode_file != None :
barcode_string="""
<module id="P_Barcode">
<param name='mode'><value>symmetric</value></param>
<param name='barcode.fasta'>
<value>{0}</value>
</param>
<param name='score'><value>{1}</value></param>
</module>""".format(barcode_file, barcode_score)
# write settings
with open( settings_xml, "w") as fh:
fh.write("""<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
fh.write(("""<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<smrtpipeSettings>
<module id="P_Fetch" />
<module id="P_Filter" >
......@@ -59,8 +69,9 @@ def rs_subreads(inputfile, componentdir, smrtpipe, fofnToSmrtpipeInput, min_subr
</param>
</module>
<module id="P_FilterReports"/>
"""+barcode_string+"""
</smrtpipeSettings>
""".format(min_subreads_length, polymerase_read_qual, polymerase_read_length))
""").format(min_subreads_length, polymerase_read_qual, polymerase_read_length))
# write fofn and xml
reader = PacbioH5Reader(inputfile)
......@@ -92,23 +103,26 @@ class RS_Subreads (Analysis):
This module filters reads based on a minimum subread length, polymerase read quality and polymerase read length
"""
def define_parameters(self, sample_names, input_files, min_subreads_length = 0, polymerase_read_qual = 0, polymerase_read_length = 0):
def define_parameters(self, sample_names, input_files, min_subreads_length = 0, polymerase_read_qual = 0, polymerase_read_length = 0, barcode_file=None, barcode_score=22):
self.add_parameter_list("sample_names", "sample names, each sample name must correspond to an input file", default=sample_names, required=True)
self.add_input_file_list( "input_files", "Input pacbio bas.h5 files", default=input_files, file_format = h5file, required=True)
self.add_parameter("min_subreads_length", "Subreads shorter than this value (in base pairs) are filtered out and excluded from analysis", default=min_subreads_length, type='int')
self.add_parameter("polymerase_read_qual", "Polymerase reads with lower quality than this value are filtered out and excluded from analysis", default=polymerase_read_qual, type='float')
self.add_parameter("polymerase_read_length", "Polymerase reads shorter than this value (in base pairs) are filtered out and excluded from analysis", default=polymerase_read_length, type='int')
self.add_input_file( "barcode_file", "Input barcode file", default=barcode_file)
self.add_parameter("barcode_score", "Min identical base for barcode", default=barcode_score, type='int')
items = [os.path.basename(os.path.splitext(os.path.splitext(filename)[0])[0]) for filename in self.input_files]
self.add_output_file_list( 'stdouts', "logs", pattern="{basename}.stdout", items=items)
def process(self):
subreads = PythonFunction(rs_subreads, cmd_format="{EXE} {IN} {ARG} {OUT}")
subreads = PythonFunction(rs_subreads, cmd_format="{EXE} {IN} {OUT} {ARG}")
for i,e in enumerate(self.input_files) :
subreads(inputs = self.input_files[i],
outputs = [self.stdouts[i]],
arguments = [self.output_directory, self.get_exec_path("smrtpipe"), self.get_exec_path("fofnToSmrtpipeInput.py"),
self.min_subreads_length, self.polymerase_read_qual, self.polymerase_read_length])
self.min_subreads_length, self.polymerase_read_qual, self.polymerase_read_length, self.barcode_file, self.barcode_score ])
def get_version(self):
return "1.0"
......
>ORF1_Ctrl
GTGCGTATGTCGCTAC
>ORF1_17
GTACATATGCGTCTGT
>ORF1_18
GAGACTAGAGATAGTG
>ORF1_19
TACGCGTGTACGCAGA
>ORF1_20
TGTCACTCATCTGAGT
>ORF1_21
GCACATACACGCTCAC
>ORF1_22
GCTCGTCGCGCGCACA
>ORF1_23
ACAGTGCGCTGTCTAT
>ORF1_24
TCACACTCTAGAGCGA
>ORF1_25
TCACATATGTATACAT
>ORF1_26
CGCTGCGAGAGACAGT
>ORF1_27
ACACACAGACTGTGAG
>ORF1_28
GCAGACTCTCACACGC
>ORF1_29
TGCTCTCGTGTACTGT
>ORF1_30
GTGTGAGATATATATC
>ORF1_31
CTCAGTGTGACACATG
>ORF1_32
TGCGAGCGACTCTATC
>ORF1_33
GTCAGCTAGTGTCAGC
>ORF1_34
AGATATCATCAGCGAG
>ORF1_35
GTGCAGTGATCGATGA
>ORF1_36
TGACTCGCTCATAGTC
>ORF1_37
ATGCTGATGACGCGCT
>ORF1_38
GACAGCATCTGCGCTC
>ORF1_39
AGCGTCTGACGTGAGT
>ORF1_40
TCGATATACGACGTGC
>ORF1_41
TCGTCATACGCTCTAG
>ORF1_42
CGACTACGTACAGTAG
>ORF1_43
GCGTAGACAGACTACA
>ORF1_44
ACAGTATGATGTACTC
>ORF1_45
GTCTGATAGATACAGA
>ORF1_46
CTGCGCAGTACGTGCA
>ORF1_47
TAGATCTCTGACTCAC
>ORF1_48
CTGATGCGCGCTGTAC
......@@ -30,9 +30,18 @@ lambda
--description
pacbio demo workflow
--project-name
Demo project
test-dev
--sample
sample-name=Pacbio_sample
read1=workflows/pacbio_qc/data/primary/lambda_v210/Analysis_Results/m130802_062611_ethan_c100542982550000001823084811241306_s1_p0.bas.h5
--min-subreads-length
0
--polymerase-read-qual
0
--polymerase-read-length
0
--barcode-file
workflows/pacbio_qc/data/barcode.fasta
--barcode-score
22
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment