Commit f79a0774 authored by Audrey Gibert

[Sequel_qc]

Beginning the transformation
parent 8645bb68
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import logging
from ng6.ng6workflow import NG6Workflow
from ng6.utils import Utils
class SequelQualityCheck (NG6Workflow):
    """Workflow inheriting from NG6Workflow: Sequel II PacBio data loading and quality check."""

    def __init__(self, *args, **kwargs):
        """
        Forward every constructor argument to NG6Workflow.

        An override that neither accepts arguments nor calls the parent
        constructor skips the whole base-class initialisation, so the call
        is made explicitly here.
        """
        super(SequelQualityCheck, self).__init__(*args, **kwargs)

    def get_name(self):
        """Return the identifier of the workflow."""
        return 'sequel_qc'

    def get_description(self):
        """Return the one-line description of the workflow."""
        return "Sequel II PacBio data loading and quality check"

    def define_parameters(self, function="process"):
        """
        Declare the workflow parameters: fastqc thread count, subread and
        polymerase read filters, and the barcode file with its minimum score.
        @param function: forwarded by the framework; not used by this method
        """
        logging.getLogger("jflow").debug("SequelQC | SequelQualityCheck.define_parameters!")
        self.add_parameter("nb_threads", "Number of threads to use for fastqc. Each thread will be allocated 250MB of memory.", default=3)
        self.add_parameter("min_subreads_length", "Subreads shorter than this value (in base pairs) are filtered out and excluded from analysis", default=0, type='int')
        self.add_parameter("polymerase_read_qual", "Polymerase reads with lower quality than this value are filtered out and excluded from analysis", default=0, type='float')
        self.add_parameter("polymerase_read_length", "Polymerase reads shorter than this value (in base pairs) are filtered out and excluded from analysis", default=0, type='int')
        self.add_input_file( "barcode_file", "Input barcode file", default=None)
        self.add_parameter("barcode_score", "Min identical base for barcode", default=22, type='int')

    def process(self):
        """
        Chain the components: raw file archiving, h5 to fastq conversion,
        FastQC on the converted fastq files, then RS_Subreads.
        """
        logger = logging.getLogger("jflow")
        logger.debug("SequelQC | SequelQualityCheck.process started!")

        # One name and one input file (first entry of reads1) per sample, kept in the same order.
        sample_names = []
        infiles = []
        for sample in self.samples :
            sample_names.append( sample.name )
            infiles.append(sample.reads1[0])

        self.add_component("AddPacBioRawFiles", [self.runobj, self.get_all_reads()])
        h5tofastq = self.add_component("H5toFastq", [sample_names, infiles])
        # FastQC consumes the fastq files produced by H5toFastq, hence the parent link.
        self.add_component("FastQC", [h5tofastq.output_fastqs, False, False, "fastqc.tar.gz", self.nb_threads], parent = h5tofastq)
        self.add_component("RS_Subreads", [sample_names, infiles,self.min_subreads_length,self.polymerase_read_qual,self.polymerase_read_length,self.barcode_file,self.barcode_score ])
        logger.debug("SequelQC | SequelQualityCheck.process ended!")
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import pickle
from jflow.component import Component
from workflows.pacbio_qc.lib.pacbiolib import h5file
from weaver.function import PythonFunction
def add_pacbio_raw_files(run_dump_path, tempdir, stdoutfile):
    """
    Count the sequences of the raw pacbio files of a run, then archive these
    files (h5 parts, bas.h5 and metadata xml) in the run.

    The imports are local because AddPacBioRawFiles wraps this function in a
    weaver PythonFunction (presumably executed outside of this module - TODO confirm).

    @param run_dump_path: path of the pickled run object
    @param tempdir: temporary directory (not used by the function)
    @param stdoutfile: path of the report written by the function
    """
    import pickle
    from workflows.pacbio_qc.lib.pacbiolib import PacbioH5Reader
    from jflow.utils import display_info_message

    with open(stdoutfile, 'w') as fhout :
        # --- add_pacbio_raw_files ---
        # NOTE(review): pickle.load must only be fed dumps written by the workflow itself.
        with open(run_dump_path, "rb") as dump_handle :
            my_run = pickle.load(dump_handle)

        files_to_save = []
        nb_seq, full_size = 0, 0
        for ifile in my_run.raw_files:
            # Already collected as a part of a previously read movie.
            if ifile in files_to_save :
                continue

            # total sequence length
            reader = PacbioH5Reader(ifile)
            h5 = reader.bash5
            for name, description, sequence, qualities in reader :
                nb_seq += 1
                full_size += len(sequence)

            # files_to_save holds filenames, so the membership test must be
            # done on the filename and not on the part object itself.
            for partfile in h5.parts :
                if partfile.filename not in files_to_save :
                    files_to_save.append(partfile.filename)

            # it's a bas.h5
            if h5.filename and h5.filename not in files_to_save:
                files_to_save.append(h5.filename)

            # copy .metadata.xml
            if reader.metadata :
                if reader.metadata not in files_to_save :
                    files_to_save.append(reader.metadata)
            else :
                display_info_message("Warning : no metadata file found for input file : %s "%ifile)

        fhout.write("nb_seq : ")
        fhout.write(str(nb_seq)+"\n")
        fhout.write("full_size : ")
        fhout.write(str(full_size)+"\n")
        fhout.write("Files to save : \n")
        fhout.write("\n".join(files_to_save) )

        my_run.set_nb_sequences(nb_seq)
        my_run.set_full_size(full_size)
        my_run.archive_files(files_to_save, "none")
        my_run.sync()
class AddPacBioRawFiles (Component):
    """Component saving the pacbio h5 input files as raw files of the run."""

    def define_parameters(self, runobj, input_files):
        """
        @param runobj: the run object the raw files are attached to
        @param input_files: the pacbio h5 files to be saved as raw files
        """
        self.runobj = runobj
        self.add_input_file_list( "input_files", "File to be saved as raw files", default=input_files, file_format = h5file, required=True)
        self.add_output_file("stdout", "AddPacBioRawFiles stdout", filename="AddPacBioRawFiles.stdout")

    def process(self):
        """Pickle the run object and schedule add_pacbio_raw_files on this dump."""
        self.runobj.raw_files = self.input_files
        run_dump_path = self.get_temporary_file(".dump")
        # Close the dump explicitly so it is fully written before the
        # scheduled function reads it back.
        with open(run_dump_path, "wb") as dump_handle :
            pickle.dump(self.runobj, dump_handle)
        addraw = PythonFunction(add_pacbio_raw_files, cmd_format='{EXE} {ARG} {OUT}')
        addraw(outputs=self.stdout, includes=self.input_files, arguments=[run_dump_path, self.config_reader.get_tmp_directory()])
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from workflows.pacbio_qc.lib.pacbiolib import h5file
from jflow.abstraction import MultiMap
from ng6.analysis import Analysis
from weaver.function import PythonFunction
def h5_to_fastq(input_file, fastqfile):
    """
    Write every sequence yielded by PacbioH5Reader to a gzipped fastq file.

    @param input_file: path of the pacbio h5 file to read
    @param fastqfile: path of the fastq.gz file to create
    """
    import gzip
    from workflows.pacbio_qc.lib.pacbiolib import PacbioH5Reader

    newline = "\n".encode()
    # generate fastq.gz file; the context manager closes it even if the
    # reader or a write fails part-way through.
    with gzip.open(fastqfile, 'wb') as f_out :
        reader = PacbioH5Reader(input_file)
        for name, description, sequence, qualities in reader :
            f_out.write(str("@"+name+"\n").encode())
            # sequence and qualities are concatenated with bytes as-is,
            # only the read name is encoded.
            f_out.write(sequence+newline)
            f_out.write("+\n".encode())
            f_out.write(qualities+newline)
class H5toFastq (Analysis):
    """Analysis running h5_to_fastq on each input pacbio h5 file."""

    def define_analysis(self):
        """Set the name, description, software and options of the analysis."""
        self.name = "H5toFastq"
        self.description = "Extract subreads of pacbio bas.h5 files to fastq files"
        self.software = "Python"
        self.options = ""

    def define_parameters(self, sample_names, input_files):
        """
        @param sample_names: sample names, each one must correspond to an input file
        @param input_files: the pacbio bas.h5 files to convert
        """
        self.add_parameter_list(
            "sample_names",
            "sample names, each sample name must correspond to an input file",
            default=sample_names,
            required=True)
        self.add_input_file_list(
            "input_files",
            "Input pacbio bas.h5 files",
            default=input_files,
            file_format=h5file,
            required=True)
        # Output files are declared from the sample names with the fastq.gz pattern.
        self.add_output_file_list(
            'output_fastqs',
            "Output fastq files",
            pattern="{basename_woext}.fastq.gz",
            file_format="fastq",
            items=self.sample_names)

    def get_version(self):
        """Return the version of the analysis."""
        return "1.0"

    def process(self):
        """Map the conversion function from the input files to the output fastq files."""
        to_fastq = PythonFunction(h5_to_fastq, cmd_format="{EXE} {IN} {OUT}")
        MultiMap(to_fastq, inputs=self.input_files, outputs=self.output_fastqs)

    def post_process(self):
        """Save the generated fastq files."""
        self._save_files(self.output_fastqs)
\ No newline at end of file
This diff is collapsed.
>ORF1_Ctrl
GTGCGTATGTCGCTAC
>ORF1_17
GTACATATGCGTCTGT
>ORF1_18
GAGACTAGAGATAGTG
>ORF1_19
TACGCGTGTACGCAGA
>ORF1_20
TGTCACTCATCTGAGT
>ORF1_21
GCACATACACGCTCAC
>ORF1_22
GCTCGTCGCGCGCACA
>ORF1_23
ACAGTGCGCTGTCTAT
>ORF1_24
TCACACTCTAGAGCGA
>ORF1_25
TCACATATGTATACAT
>ORF1_26
CGCTGCGAGAGACAGT
>ORF1_27
ACACACAGACTGTGAG
>ORF1_28
GCAGACTCTCACACGC
>ORF1_29
TGCTCTCGTGTACTGT
>ORF1_30
GTGTGAGATATATATC
>ORF1_31
CTCAGTGTGACACATG
>ORF1_32
TGCGAGCGACTCTATC
>ORF1_33
GTCAGCTAGTGTCAGC
>ORF1_34
AGATATCATCAGCGAG
>ORF1_35
GTGCAGTGATCGATGA
>ORF1_36
TGACTCGCTCATAGTC
>ORF1_37
ATGCTGATGACGCGCT
>ORF1_38
GACAGCATCTGCGCTC
>ORF1_39
AGCGTCTGACGTGAGT
>ORF1_40
TCGATATACGACGTGC
>ORF1_41
TCGTCATACGCTCTAG
>ORF1_42
CGACTACGTACAGTAG
>ORF1_43
GCGTAGACAGACTACA
>ORF1_44
ACAGTATGATGTACTC
>ORF1_45
GTCTGATAGATACAGA
>ORF1_46
CTGCGCAGTACGTGCA
>ORF1_47
TAGATCTCTGACTCAC
>ORF1_48
CTGATGCGCGCTGTAC
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
--date
18/03/2015
--data-nature
DNA
--sequencer
Pacbio RS II
--name
pacbio_qc
--type
Unknown
--species
lambda
--description
pacbio demo workflow
--project-name
test-dev
--sample
sample-name=Pacbio_sample
read1=workflows/pacbio_qc/data/primary/lambda_v210/Analysis_Results/m130802_062611_ethan_c100542982550000001823084811241306_s1_p0.bas.h5
--min-subreads-length
0
--polymerase-read-qual
0
--polymerase-read-length
0
--barcode-file
workflows/pacbio_qc/data/barcode.fasta
--barcode-score
22
<?xml version="1.0" encoding="utf-8"?><Metadata xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns="http://pacificbiosciences.com/PAP/Metadata.xsd"><InstCtrlVer>2.1.0.0.125472</InstCtrlVer><SigProcVer>NRT@172.31.128.10:8082, SwVer=2100.125472, HwVer=1.0</SigProcVer><Run><RunId>r000938_ethan_130801</RunId><Name>B4_PSS_2kblambda_P4C2_std45_080113</Name><WhenCreated>2013-08-01T14:14:37</WhenCreated><WhenStarted>2013-08-01T22:16:17</WhenStarted></Run><Movie><WhenStarted>2013-08-02T06:33:36.525058+00:00</WhenStarted><DurationInSec>2700</DurationInSec><Number>0</Number></Movie><Sample><Name>2kb lambda</Name><PlateId>B4_PSS_2kblambda_P4C2_std45_080113</PlateId><WellName>A03</WellName><Concentration>0</Concentration><SampleReuseEnabled>false</SampleReuseEnabled><StageHotstartEnabled>false</StageHotstartEnabled><SizeSelectionEnabled>false</SizeSelectionEnabled><UseCount>1</UseCount><Comments>B4_PSS_2kblambda_P4C2_std45_080113</Comments></Sample><InstrumentId>1</InstrumentId><InstrumentName>ethan</InstrumentName><CollectionProtocol>Standard Seq v3</CollectionProtocol><CollectionNumber>7</CollectionNumber><CellIndex>6</CellIndex><SetNumber>1</SetNumber><EightPac><PartNumber>0018</PartNumber><LotNumber>230848</LotNumber><Barcode>10054298255000000182308481124130</Barcode><ExpirationDate>2013-11-24</ExpirationDate></EightPac><TemplatePrep><Name>DNA Template Prep Kit 2.0 (250bp - 3Kb)</Name><PartNumber>001540726</PartNumber><LotNumber>000001</LotNumber><Barcode>000001001540726123115</Barcode><ExpirationDate>2015-12-31</ExpirationDate><AdapterSequence>ATCTCTCTCttttcctcctcctccgttgttgttgttGAGAGAGAT</AdapterSequence><InsertSize>2000</InsertSize></TemplatePrep><BindingKit><Name>DNA/Polymerase Binding Kit P4</Name><PartNumber>100236500</PartNumber><LotNumber>000001</LotNumber><Barcode>000001100236500123115</Barcode><ExpirationDate>2015-12-31</ExpirationDate><Control 
/><IsControlUsed>false</IsControlUsed></BindingKit><SequencingKit><Name>ReagentPlate0</Name><PartNumber>001558034</PartNumber><LotNumber>002648</LotNumber><Barcode>002648345001558034011414</Barcode><ExpirationDate>2014-01-14</ExpirationDate><Protocol>C2ReagentMixingProtocol_DWP</Protocol></SequencingKit><ReagentTube0><Name>ReagentTube0-0</Name><PartNumber>001028310</PartNumber><LotNumber>002573</LotNumber><Barcode>002573414001028310090715</Barcode><ExpirationDate>2015-09-07</ExpirationDate></ReagentTube0><ReagentTube1><Name>ReagentTube0-1</Name><PartNumber>100192000</PartNumber><LotNumber>020713</LotNumber><Barcode>020713056100192000123115</Barcode><ExpirationDate>2015-12-31</ExpirationDate></ReagentTube1><Primary><Protocol>BasecallerV1</Protocol><ConfigFileName>2-0-0_P4-C2.xml</ConfigFileName><ResultsFolder>Analysis_Results</ResultsFolder><CollectionPathUri>rsy://mp-f030-io/vol52/RS_DATA_STAGING/ethan/B4_PSS_2kblambda_P4C2_std45_080113_938/A03_7/</CollectionPathUri><CollectionFileCopy>Fasta</CollectionFileCopy><CollectionFileCopy>Fastq</CollectionFileCopy></Primary><Secondary><ProtocolName /><CellCountInJob>0</CellCountInJob></Secondary><Custom><KeyValue key="svc:/CentralDataSvc/#Display.Sample_Metadata.User_Defined_Field_1" label="User Defined Field 1">LIMS_IMPORT=2311813</KeyValue><KeyValue key="svc:/CentralDataSvc/#Display.Sample_Metadata.User_Defined_Field_2" label="User Defined Field 2" /><KeyValue key="svc:/CentralDataSvc/#Display.Sample_Metadata.User_Defined_Field_3" label="User Defined Field 3" /><KeyValue key="svc:/CentralDataSvc/#Display.Sample_Metadata.User_Defined_Field_4" label="User Defined Field 4" /><KeyValue key="svc:/CentralDataSvc/#Display.Sample_Metadata.User_Defined_Field_5" label="User Defined Field 5" /><KeyValue key="svc:/CentralDataSvc/#Display.Sample_Metadata.User_Defined_Field_6" label="User Defined Field 6" 
/></Custom><ExpirationData><EightPacPastExpiration>0</EightPacPastExpiration><ReagentKitPastExpiration>0</ReagentKitPastExpiration><ReagentTube0PastExpiration>0</ReagentTube0PastExpiration><ReagentTube1PastExpiration>0</ReagentTube1PastExpiration></ExpirationData></Metadata>
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
\ No newline at end of file
This diff is collapsed.
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from .lightpbcoreio import BasH5Reader
from jflow.seqio import Sequence
import numpy as np
import os
class PacbioH5Reader(object):
    """
    Reader for pacbio movie file
    """

    def __init__(self, file):
        """
        Open the h5 part(s) of a movie and look for its metadata file.

        The h5 file is expected in a results directory whose parent (the
        cell directory) is searched for the first '*.metadata.xml' file.

        @param file: either the .bas.h5 or one of the three .bax.h5 files
        """
        parts = []
        analysisresults_dir = os.path.dirname(file)
        celldir = os.path.dirname(analysisresults_dir)

        if file.endswith(".bax.h5"):
            # Collect the sibling .bax.h5 files; a .bas.h5 found in the same
            # directory replaces them all and stops the scan.
            for each in os.listdir(analysisresults_dir) :
                fullpath = os.path.join(analysisresults_dir, each)
                if each.endswith('.bax.h5') :
                    parts.append(fullpath)
                elif each.endswith('.bas.h5') :
                    parts = [fullpath]
                    break
        elif file.endswith(".bas.h5"):
            parts = [file]
        # NOTE(review): with any other extension parts stays empty and
        # BasH5Reader is called without argument - confirm this is intended.
        self.bash5 = BasH5Reader(*parts)

        self.metadata = None
        for meta in os.listdir(celldir) :
            metafile = os.path.join(celldir, meta)
            if os.path.isfile(metafile) and meta.endswith(".metadata.xml"):
                self.metadata = metafile
                break

    def qvsFromAscii(self, s):
        """
        Convert an ascii quality string to quality values (offset 33).
        @param s: the quality string, as bytes or str
        @return: a numpy uint8 array of quality values
        """
        # np.frombuffer replaces the deprecated binary mode of np.fromstring;
        # it needs a bytes-like object, so str input is encoded first.
        if isinstance(s, str):
            s = s.encode()
        return (np.frombuffer(s, dtype=np.uint8) - 33)

    def asciiFromQvs(self, a):
        """
        Convert quality values to their ascii representation (offset 33).
        @param a: the quality values, clipped to [0, 93] before conversion
        @return: the quality string as bytes
        """
        # tobytes() is the replacement of the removed ndarray.tostring().
        return (np.clip(a, 0, 93).astype(np.uint8) + 33).tobytes()

    def __iter__(self):
        """Yield one Sequence per subread; the read name is used as id and description."""
        for zmwRead in self.bash5.subreads():
            yield Sequence(zmwRead.readName, zmwRead.readName, zmwRead.basecalls(), self.asciiFromQvs(zmwRead.QualityValue()))
def h5file(ifile):
    """
    Check that a file can be read as a pacbio h5 file by reading at most
    its first 10 sequences.

    @param ifile: path of the file to check
    @raise jflow.InvalidFormatError: if the file cannot be opened or read
    """
    # The module only imports names from jflow sub-modules, so the package
    # name itself has to be bound here for jflow.InvalidFormatError.
    import jflow

    try :
        reader = PacbioH5Reader(ifile)
        nb_seq = 0
        for seq_id, desc, seq, qualities in reader:
            nb_seq += 1
            if nb_seq == 10: break
    # Exception (not a bare except) so KeyboardInterrupt / SystemExit are
    # not reported as a format error.
    except Exception :
        raise jflow.InvalidFormatError("The provided file '" + ifile + "' is not a valid pacbio h5 file!")
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment