Commit fbad0395 authored by Penom Nom's avatar Penom Nom
Browse files

update pacbio_qc. metadata.xm or .bas.h5 files are not mandataory anymore

parent 3647fe03
......@@ -20,63 +20,73 @@ import pickle
from jflow.component import Component
from workflows.pacbio_qc.lib.pacbiolib import bash5
from workflows.pacbio_qc.lib.pacbiolib import h5file
from weaver.function import PythonFunction
def add_pacbio_raw_files(run_dump_path, tempdir):
def add_pacbio_raw_files(run_dump_path, tempdir, stdoutfile):
import pickle
import os
import gzip
import sys
from workflows.pacbio_qc.lib.pacbiolib import PacbioH5Reader
# --- add_pacbio_raw_files ---
my_run = pickle.load(open(run_dump_path, "rb"))
files_to_save = []
nb_seq, full_size = 0, 0
for file in my_run.raw_files:
# copy .bas.h5
reader = PacbioH5Reader(file)
bash5 = reader.bash5
for name, description, sequence, qualities in reader :
nb_seq += 1
full_size += len(sequence)
from jflow.utils import display_info_message
with open(stdoutfile, 'w') as fhout :
# --- add_pacbio_raw_files ---
my_run = pickle.load(open(run_dump_path, "rb"))
files_to_save = []
nb_seq, full_size = 0, 0
# copy .metadata.xml
metadata = None
celldir = os.path.dirname(os.path.dirname(file))
for meta in os.listdir(celldir) :
meta = os.path.join(celldir, meta)
if os.path.isfile(meta) and meta.endswith(".metadata.xml"):
metadata = meta
break
if metadata is None :
raise Exception("Missing metadata xml file for " + file )
for ifile in my_run.raw_files:
analysisresults_dir = os.path.dirname(ifile)
celldir = os.path.dirname(analysisresults_dir)
if ifile not in files_to_save :
# total sequence length
reader = PacbioH5Reader(ifile)
h5 = reader.bash5
for name, description, sequence, qualities in reader :
nb_seq += 1
full_size += len(sequence)
# copy .bax.h5
for partfile in bash5.parts :
files_to_save.append(partfile.filename)
files_to_save.append(file)
files_to_save.append(metadata)
for partfile in h5.parts :
if partfile not in files_to_save :
files_to_save.append(partfile.filename)
# it's a bas.h5
if h5.filename and h5.filename not in files_to_save:
files_to_save.append(h5.filename)
# copy .metadata.xml
if reader.metadata :
if reader.metadata not in files_to_save :
files_to_save.append(reader.metadata)
else :
display_info_message("Warning : no metadata file found for input file : %s "%ifile)
my_run.set_nb_sequences(nb_seq)
my_run.set_full_size(full_size)
my_run.archive_files(files_to_save, "none")
my_run.sync()
fhout.write("nb_seq : ")
fhout.write(str(nb_seq)+"\n")
fhout.write("full_size : ")
fhout.write(str(full_size)+"\n")
fhout.write("Files to save : \n")
fhout.write("\n".join(files_to_save) )
my_run.set_nb_sequences(nb_seq)
my_run.set_full_size(full_size)
my_run.archive_files(files_to_save, "none")
my_run.sync()
class AddPacBioRawFiles (Component):
def define_parameters(self, runobj, input_files):
self.runobj = runobj
self.add_input_file_list( "input_files", "File to be saved as raw files", default=input_files, file_format = bash5, required=True)
self.add_input_file_list( "input_files", "File to be saved as raw files", default=input_files, file_format = h5file, required=True)
self.add_output_file("stdout", "AddPacBioRawFiles stdout", filename="AddPacBioRawFiles.stdout")
def process(self):
self.runobj.raw_files = self.input_files
run_dump_path = self.get_temporary_file(".dump")
pickle.dump(self.runobj, open(run_dump_path, "wb"))
addraw = PythonFunction(add_pacbio_raw_files, cmd_format='{EXE} {ARG} > {OUT}')
addraw = PythonFunction(add_pacbio_raw_files, cmd_format='{EXE} {ARG} {OUT}')
addraw(outputs=self.stdout, includes=self.input_files, arguments=[run_dump_path, self.config_reader.get_tmp_directory()])
\ No newline at end of file
......@@ -17,7 +17,7 @@
import os
from workflows.pacbio_qc.lib.pacbiolib import bash5
from workflows.pacbio_qc.lib.pacbiolib import h5file
from jflow.abstraction import MultiMap
......@@ -33,7 +33,6 @@ def h5_to_fastq(input_file, fastqfile):
# generate fastq.gz file
f_out = gzip.open(fastqfile, 'wb')
# copy .bas.h5
reader = PacbioH5Reader(input_file)
for name, description, sequence, qualities in reader :
......@@ -46,7 +45,7 @@ def h5_to_fastq(input_file, fastqfile):
class H5toFastq (Analysis):
def define_parameters(self, input_files):
self.add_input_file_list( "input_files", "Input pacbio bas.h5 files", default=input_files, file_format = bash5, required=True)
self.add_input_file_list( "input_files", "Input pacbio bas.h5 files", default=input_files, file_format = h5file, required=True)
items = [os.path.basename(os.path.splitext(os.path.splitext(filename)[0])[0]) for filename in self.input_files]
self.add_output_file_list( 'output_fastqs', "Output fastq files", pattern="{basename_woext}.fastq.gz", file_format="fastq", items=items)
......
......@@ -19,9 +19,7 @@
import os
import json
from workflows.pacbio_qc.lib.pacbiolib import bash5
from weaver.abstraction import Map
from workflows.pacbio_qc.lib.pacbiolib import h5file
from ng6.analysis import Analysis
......@@ -82,7 +80,7 @@ class RS_Subreads (Analysis):
"""
def define_parameters(self, input_files, min_subreads_length = 50, polymerase_read_qual = 75.0, polymerase_read_length = 50):
self.add_input_file_list( "input_files", "Input pacbio bas.h5 files", default=input_files, file_format = bash5, required=True)
self.add_input_file_list( "input_files", "Input pacbio bas.h5 files", default=input_files, file_format = h5file, required=True)
self.add_parameter("min_subreads_length", "Subreads shorter than this value (in base pairs) are filtered out and excluded from analysis", default=min_subreads_length, type='int')
self.add_parameter("polymerase_read_qual", "Polymerase reads with lower quality than this value are filtered out and excluded from analysis", default=polymerase_read_qual, type='float')
self.add_parameter("polymerase_read_length", "Polymerase reads shorter than this value (in base pairs) are filtered out and excluded from analysis", default=polymerase_read_length, type='int')
......
......@@ -17,7 +17,9 @@
from lightpbcoreio import BasH5Reader
from jflow.seqio import Sequence
import numpy as np
import os
class PacbioH5Reader(object):
"""
......@@ -25,9 +27,30 @@ class PacbioH5Reader(object):
"""
def __init__(self, file):
"""
@param file: pacbio .bas.h5 file
@param file: either the .bas.h5 or one of the three .bax.h5 files
"""
self.bash5 = BasH5Reader(file)
parts = []
analysisresults_dir = os.path.dirname(file)
celldir = os.path.dirname(analysisresults_dir)
if file.endswith(".bax.h5"):
for each in os.listdir(analysisresults_dir) :
fullpath = os.path.join(analysisresults_dir, each)
if each.endswith('.bax.h5') :
parts.append(fullpath)
elif each.endswith('.bas.h5') :
parts = [fullpath]
break
elif file.endswith(".bas.h5"):
parts = [file]
self.bash5 = BasH5Reader(*parts)
self.metadata = None
for meta in os.listdir(celldir) :
metafile = os.path.join(celldir, meta)
if os.path.isfile(metafile) and meta.endswith(".metadata.xml"):
self.metadata = metafile
break
def qvsFromAscii(self,s):
return (np.fromstring(s, dtype=np.uint8) - 33)
......@@ -39,22 +62,14 @@ class PacbioH5Reader(object):
for zmwRead in self.bash5.reads():
yield Sequence(zmwRead.readName, zmwRead.readName, zmwRead.basecalls(), self.asciiFromQvs(zmwRead.QualityValue()))
def bash5(ifile):
try:
hasmetadata = False
celldir = os.path.dirname(os.path.dirname(ifile))
for file in os.listdir(celldir) :
if os.path.isfile(file) and file.endswith(".metadata.xml"):
hasmetadata = True
break
if not hasmetadata :
raise Exception("Wrong directory structure for bas.h5 file "+ ifile)
def h5file(ifile):
try :
reader = PacbioH5Reader(ifile)
nb_seq = 0
for id, desc, seq, qualities in reader:
nb_seq += 1
# only check the first 10 sequences
if nb_seq == 10: break
except:
raise jflow.InvalidFormatError("The provided file '" + ifile + "' is not a valid .bas.h5 file!")
except :
raise jflow.InvalidFormatError("The provided file '" + ifile + "' is not a valid pacbio h5 file!")
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment