Commit 31b218d8 authored by Penom Nom's avatar Penom Nom
Browse files

upgrade sample handling

parent b31ba169
......@@ -1003,8 +1003,15 @@ CREATE TABLE IF NOT EXISTS `tx_nG6_sample` (
`deleted` tinyint(4) NOT NULL DEFAULT '0',
`hidden` tinyint(4) NOT NULL DEFAULT '0',
`run_id` int(11) NOT NULL DEFAULT '0',
`mid` varchar(255) NOT NULL DEFAULT '',
`sample_id` varchar(255) NOT NULL DEFAULT '',
`name` varchar(255) NOT NULL DEFAULT '',
`description` varchar(255) NOT NULL DEFAULT '',
`reads1` text NOT NULL DEFAULT '',
`reads2` text NOT NULL DEFAULT '',
`type` varchar(255) NOT NULL DEFAULT '',
`insert_size` int(11) NOT NULL DEFAULT '0',
`species` varchar(255) NOT NULL DEFAULT '',
`nb_sequence` int(11) NOT NULL DEFAULT '0',
PRIMARY KEY (`uid`),
KEY `parent` (`pid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=1 ;
......
......@@ -330,7 +330,7 @@ class Analysis (Component):
def _create_and_archive(self, files, archive_name=None, prefix="dir"):
"""
return the web path to the archive files
If there are mids, datas will be organised by mids
If there are samples, data will be organised by samples
@param files : table of file
@param archive_name : the archive name to use, if not set will be
the same as link_value
......@@ -352,14 +352,14 @@ class Analysis (Component):
#If some files have the same name
if len(file_basenames) != len(set(file_basenames)) :
try : # If there are mids
try : # If there are samples
if self.run :
mids_desc = self.run.get_mids_description()
samples = self.run.get_samples()
else :
run_id = self.__get_run_id()
my_run = Run.get_from_id(run_id)
mids_desc = my_run.get_mids_description()
samples = my_run.get_samples()
gfiles = {}
ungrouped_files = []
what_left = []
......@@ -367,16 +367,17 @@ class Analysis (Component):
ungrouped_files.append(file)
what_left.append(file)
# First group files if they have the mid name in their path
# First group files if they have the sample name in their path
for file in files :
# In case of multiple descriptions take the longest
best_description = []
for mid_desc in mids_desc.keys():
mid_desc_regex = re.compile(".*" + mid_desc + ".*")
mdr = mid_desc_regex.match(file)
if mdr != None:
best_description.append(mid_desc)
for sample in samples :
spname = sample.name
spname_regex = re.compile(".*" + spname + ".*")
spr = spname_regex.match(file)
if spr :
best_description.append(spname)
max_len = 0
final_description = None
......
......@@ -32,7 +32,7 @@ from ng6.run import Run
from ng6.sample import Sample
from ng6.utils import Utils
from workflows.types import adminlogin, casava_dir
from workflows.types import adminlogin, casava_dir, existingproject
class BasicNG6Workflow (Workflow):
......@@ -46,7 +46,7 @@ class BasicNG6Workflow (Workflow):
def get_parameters_per_groups(self, parameters_section="parameters"):
name, description, parameters = self._get_from_config(parameters_section)
parameters.extend(self._get_required_parameters())
parameters.extend(self._required_parameters)
pgparameters, parameters_order = {}, []
for param in parameters:
if param.group not in parameters_order: parameters_order.append(param.group)
......@@ -55,9 +55,6 @@ class BasicNG6Workflow (Workflow):
else:
pgparameters[param.group] = [param]
return [pgparameters, parameters_order]
def _get_required_parameters(self):
    """
    Return the parameters that are required by this workflow.
    @return : the list of required Parameter objects
    """
    return self._required_parameters
def add_component(self, component_name, args=[], kwargs={}, component_prefix="default", parent=None, addto="run"):
# first build and check if this component is OK
......@@ -119,20 +116,20 @@ class NG6Workflow (BasicNG6Workflow):
def __init__(self, args={}, id=None, function= "process", parameters_section="parameters"):
BasicNG6Workflow.__init__(self, args, id, function, parameters_section)
self.samples = []
self._required_parameters.extend(self._add_required_parameters())
self._required_parameters.extend(self._add_run_parameters())
self._required_parameters.extend(self._add_sample_parameters())
self.parameters.extend(self._add_required_parameters())
self.parameters.extend(self._add_run_parameters())
self.parameters.extend(self._add_sample_parameters())
self.args = self._extend_and_format_args(self.parameters, args)
self.__all_reads1__ = []
self.__all_reads2__ = []
self.__samples_name__ = []
self.samples = []
self.reads1 = []
self.reads2 = []
self.samples_names = []
def _add_required_parameters(self):
def _add_run_parameters(self):
orequired = []
orequired.append(Parameter("project_id", "The project id the run belongs to", "--project-id", required=True, type=int, group="Run information"))
orequired.append(Parameter("project_id", "The project id the run belongs to", "--project-id", required=True, type=existingproject, group="Run information"))
orequired.append(Parameter("run_name", "Give a name to your run", "--name", required=True, group="Run information", display_name="Name"))
orequired.append(Parameter("run_description", "Give a description to your run", "--description", required=True, group="Run information", display_name="Description"))
orequired.append(Parameter("run_date", "When were the data produced","--date", required=True, type=date, group="Run information", display_name="Date"))
......@@ -145,51 +142,62 @@ class NG6Workflow (BasicNG6Workflow):
def _add_sample_parameters(self):
osamples = []
subparams = []
subparams.append(Parameter("sample_name", "The name of the sample, MID for 454 data", "sample_name"))
subparams.append(Parameter("sample_id", "The uniq identifier of the sample", "sample_id"))
subparams.append(Parameter("sample_name", "A descriptive name for the sample", "sample_name"))
subparams.append(Parameter("sample_description", "A brief description of the sample", "sample_description" ))
subparams.append(Parameter("read1", "Read 1 data file path", "read1", required=True, type = "localfile", action= "append" ))
subparams.append(Parameter("read2", "Read 2 data file path", "read2", type = "localfile", action="append" ))
subparams.append(Parameter("type", "Read orientation and type", "type", choices = '|'.join(Sample.AVAILABLE_TYPES) ))
subparams.append(Parameter("insert_size", "Insert size for paired end reads", "insert_size", type ='int' ))
subparams.append(Parameter("species", "Species related to this sample", "species"))
subparams.append(Parameter("nb_sequence", "Number of sequences in the sample file", "nb_sequence", type = 'int'))
osamples.append(Parameter("sample", "Definition of a sample", "--sample", action = "append", type="multiple", required = True, sub_parameters = subparams, group = "exclude-sample" ))
return osamples
def __preprocess_samples__(self):
if not (self.args.has_key("sample") and self.args['sample']):
sys.stderr.write( "--sample option is required\n" )
sys.exit()
mids_description = {}
def __create_samples__(self):
for sd in self.args['sample'] :
sp_object = Sample(sd['sample_name'], sd['sample_description'], sd['read1'], sd['read2'])
if sp_object.name and sp_object.description :
mids_description[sp_object.name] = sp_object.description
if sp_object.name :
self.__samples_name__.append(sp_object.name)
self.__all_reads1__.extend(sp_object.reads1)
self.__all_reads2__.extend(sp_object.reads2)
sp_object = Sample( sd['sample_id'], sd['read1'], sd['read2'], name = sd['sample_name'], description = sd['sample_description'], type = sd['type'],
insert_size = sd['insert_size'], species = sd['species'], nb_sequence = sd['nb_sequence'] )
self.reads1.extend(sp_object.reads1)
self.reads2.extend(sp_object.reads2)
self.samples.append(sp_object)
if mids_description :
self.runobj.add_mids_description(mids_description)
def get_samples(self):
    """
    Return the samples attached to this workflow.
    @return : the list of Sample objects built from the --sample arguments
    """
    return self.samples
def get_all_reads1(self):
return self.__all_reads1__
def __preprocess_samples(self):
    """
    Normalise the workflow samples: give a generated name to unnamed
    samples and ensure every sample identifier is unique, generating an
    automatic identifier when none was provided.
    Exits the program with an error message if a duplicated identifier
    is found.
    """
    samples_ids = []
    pidx = 1  # counter for auto-generated sample identifiers
    nidx = 1  # counter for auto-generated sample names
    for sample in self.samples :
        if sample.name :
            self.samples_names.append(sample.name)
        else :
            sample.name = 'SAMPLE_' + str(nidx)
            nidx += 1
        if sample.sample_id :
            if sample.sample_id in samples_ids :
                # fixed: the format arguments must be wrapped in a tuple
                # (the bare "% a, b" only bound the first value and passed
                # two arguments to write(), raising a TypeError)
                sys.stderr.write( "Sample identifier %s must be uniq ( sample %s ).\n" % (sample.sample_id, sample.name) )
                sys.exit(1)
        # add an automatic id for samples
        else :
            sample.sample_id = 'sample_' + str(pidx)
            pidx += 1
        samples_ids.append(sample.sample_id)
def get_all_reads2(self):
return self.__all_reads2__
def get_nospace_runname(self):
    """
    Return the run name with every whitespace run collapsed to a single
    underscore (safe for use in file and directory names).
    @return : the underscore-joined run name
    """
    name_tokens = self.runobj.name.split()
    return "_".join(name_tokens)
def get_samples_name(self):
return self.__samples_name__
if len(self.samples_names) != 0 :
if len(self.samples_names) != len (self.samples) :
sys.stderr.write( "All samples must have a defined sample name" )
sys.exit(1)
def get_all_reads(self, type = None):
    """
    Return the read files registered on the workflow.
    @param type : which reads to return ('R1'/'r1'/'read1'/1/'1' for the
                  first reads, 'R2'/'r2'/'read2'/2/'2' for the second
                  reads, anything else for both lists concatenated)
    @return : a list of read file paths
    """
    read1_aliases = ['R1', 'r1', 'read1', 1, '1']
    read2_aliases = ['R2', 'r2', 'read2', 2, '2']
    if type in read1_aliases :
        return self.reads1
    if type in read2_aliases :
        return self.reads2
    return self.reads1 + self.reads2
def is_paired_end(self):
    """
    Whether the run has paired-end data.
    @return : True if at least one read-2 file is registered
    """
    # fixed: dropped the stale duplicate return that called the removed
    # get_all_reads2() accessor (it shadowed this line and would raise
    # AttributeError); reads2 is now a plain attribute
    return len(self.reads2) > 0
def pre_process(self):
# start from an existing project
......@@ -208,11 +216,10 @@ class NG6Workflow (BasicNG6Workflow):
self.project.add_run(self.runobj)
self.metadata.append("run_id="+str(self.runobj.id))
self.__preprocess_samples__()
if not self.samples :
sys.stderr.write( "Please define at least one sample.\n" )
sys.exit()
self.__create_samples__()
self.__preprocess_samples()
# add samples to run
self.runobj.add_samples(self.samples)
def post_process(self):
# once everything done, sync directories
......@@ -230,10 +237,10 @@ class CasavaNG6Workflow(NG6Workflow):
self.parameters.extend(self._add_casava_parameters())
self.args = self._extend_and_format_args(self.parameters, args)
self.__is_casava__ = False
self.__group_prefix__ = None
self.__undetermined_reads1__ = []
self.__undetermined_reads2__ = []
self.is_casava = False
self.group_prefix = None
self.undetermined_reads1 = []
self.undetermined_reads2 = []
def _add_casava_parameters(self):
params = []
......@@ -243,23 +250,11 @@ class CasavaNG6Workflow(NG6Workflow):
params.append(Parameter("casava", "Definition of a casava directory for sample retrieving", "--casava", required = True, sub_parameters = subparams, type = "multiple", group = "exclude-sample" ))
return params
def __preprocess_samples__(self):
def __create_samples__(self):
if self.args.has_key("casava") and self.args["casava"] and self.args["casava"]['casava_directory'] and self.args["casava"]['lane']:
self.__preprocess_casavadir(self.args["casava"]['casava_directory'], '_'.join( self.project.get_name().split() ), self.args["casava"]['lane'])
else :
NG6Workflow.__preprocess_samples__(self)
def get_all_undetermined_read1(self):
return self.__undetermined_reads1__
def get_all_undetermined_read2(self):
return self.__undetermined_reads1__
def is_casava(self):
return self.__is_casava__
def get_group_prefix(self):
return self.__group_prefix__
NG6Workflow.__create_samples__(self)
def __preprocess_casavadir(self, casava_directory, project_name, lane_number):
"""
......@@ -268,79 +263,66 @@ class CasavaNG6Workflow(NG6Workflow):
@param project_name : files in each sample are part of this project
@param lane_number : files in each sample are sequenced on this lane
"""
self.__is_casava__ = True
samples = []
mids_description = {}
self.is_casava = True
with open(os.path.join(casava_directory, "SampleSheet.mk")) as fh :
barcodes_list = []
sample_ids_list = []
subdirs_list = []
for line in fh :
if line.startswith("l" + lane_number + "_BARCODES"):
parts = line.strip().split(":=")
barcodes_list = parts[1].split(" ")
for i in range(len(barcodes_list)):
samples.append({'barcode':barcodes_list[i]})
# Retrieve samples ids
barcodes_list = parts[1].split()
elif line.startswith("l" + lane_number + "_SAMPLEIDS" ):
parts = line.strip().split(":=")
sample_ids_list = parts[1].split(" ")
for i in range(len(sample_ids_list)):
samples[i]['sample_id'] = sample_ids_list[i]
# Retrieve folder
elif line.startswith("l" + lane_number + "_SUBDIRS"):
parts = line.strip().split(":=")
subdirs_list = parts[1].split(" ")
for i in range(len(subdirs_list)):
samples[i]['subdir'] = subdirs_list[i]
# Filter on project name
aux_samples = []
for sample in samples:
if (re.match("Project_" + project_name + "/Sample_.+", sample['subdir']) is not None) or (sample['subdir'].startswith("Undetermined_indices")):
aux_samples.append(sample)
samples = aux_samples
if len(samples) == 0:
raise ValueError, "The project '" + project_name + "' in lane '" + lane_number + "' doesn't exist in CASAVA directory " + casava_directory
for sample in samples:
# Write line in the index description
if (sample['barcode'] != "NoIndex") and (not sample['subdir'].startswith("Undetermined_indices")) :
mids_description[sample['barcode']] = sample['sample_id']
read1_files = []
read2_files = []
undetermined_read1_files = []
undetermined_read2_files = []
# Write files lists
for file in os.listdir(casava_directory + "/" + sample['subdir']):
if file.endswith(".fastq.gz") and re.search(".*_L00" + lane_number + "_.*", file):
if re.search(".*_R1_.*", file):
if not sample['subdir'].startswith("Undetermined_indices"):
read1_files.append(casava_directory + "/" + sample['subdir'] + "/" + file)
else:
undetermined_read1_files.append(casava_directory + "/" + sample['subdir'] + "/" + file)
if re.search(".*_R2_.*", file):
if not sample['subdir'].startswith("Undetermined_indices"):
read2_files.append(casava_directory + "/" + sample['subdir'] + "/" + file)
else:
undetermined_read2_files.append(casava_directory + "/" + sample['subdir'] + "/" + file)
assert len(barcodes_list) == len(sample_ids_list) == len(subdirs_list), "Invalid lane {0} for project {1} in SampleSheet.mk".format(lane_number,project_name)
sp_object = Sample(sample['sample_id'], sample['sample_id'], read1_files, read2_files)
sp_object.add_metadata('undetermined_read1_files', undetermined_read1_files)
sp_object.add_metadata('undetermined_read2_files', undetermined_read2_files)
sp_object.add_metadata('barcode', sample['barcode'])
sp_object.add_metadata('is_casava', True)
if sp_object.name :
self.__samples_name__.append(sp_object.name)
# parse samples
for i in range(len(barcodes_list)):
read1_files = []
read2_files = []
undetermined_read1_files = []
undetermined_read2_files = []
self.__all_reads1__.extend(read1_files)
self.__all_reads2__.extend(read2_files)
self.__undetermined_reads1__.extend(undetermined_read1_files)
self.__undetermined_reads2__.extend(undetermined_read2_files)
self.samples.append(sp_object)
if mids_description :
self.runobj.add_mids_description(mids_description)
sample = {
'barcode' : barcodes_list[i],
'sample_id' : sample_ids_list[i],
'subdir' : subdirs_list[i]
}
# filter on project name
if re.match("Project_" + project_name + "/Sample_.+", sample['subdir']) or sample['subdir'].startswith("Undetermined_indices"):
for file in os.listdir(casava_directory + "/" + sample['subdir']):
filepath = casava_directory + "/" + sample['subdir'] + "/" + file
if file.endswith(".fastq.gz") and re.search(".*_L00" + lane_number + "_.*", file):
if re.search(".*_R1_.*", file):
if not sample['subdir'].startswith("Undetermined_indices"):
read1_files.append(filepath)
else:
undetermined_read1_files.append(filepath)
if re.search(".*_R2_.*", file):
if not sample['subdir'].startswith("Undetermined_indices"):
read2_files.append(filepath)
else:
undetermined_read2_files.append(filepath)
self.reads1.extend(read1_files)
self.reads2.extend(read2_files)
self.undetermined_reads1.extend(undetermined_read1_files)
self.undetermined_reads2.extend(undetermined_read2_files)
# No sample if undetermined
if not sample['subdir'].startswith("Undetermined_indices") :
sp_object = Sample(sample['barcode'], sample['sample_id'], read1_files, read2_files)
sp_object.add_metadata('undetermined_read1_files', undetermined_read1_files)
sp_object.add_metadata('undetermined_read2_files', undetermined_read2_files)
sp_object.add_metadata('barcode', sample['barcode'])
sp_object.add_metadata('is_casava', True)
self.group_prefix = (Utils.get_group_basenames(self.get_all_reads(), "read")).keys()
self.__group_prefix__ = (Utils.get_group_basenames(self.get_all_reads1()+self.get_all_reads2(), "read")).keys()
\ No newline at end of file
......@@ -64,6 +64,7 @@ class Run(object):
self.description = description
self.sequencer = sequencer
self.__mids_description = None
self.__samples = None
self.raw_files = []
self.admin_login = None
# Set the temp folder to the ng6 temp folder
......@@ -107,38 +108,6 @@ class Run(object):
self.set_full_size(full_size)
self.archive_files(self.raw_files, mode)
def export_mids_to_newbler_cfg_file(self, error=2, midscheme="454MIDS"):
    """
    Export the run mids description as a newbler-format mids config file.
    (fixed: the previous docstring was copy-pasted from another method and
    described unrelated parameters)
    @param error : the number of errors allowed on the mid sequence
    @param midscheme : the name of the mid scheme section written to the file
    @return : [path to the generated config file, the mid scheme name]
    """
    mids_cfg_path = tempfile.NamedTemporaryFile(suffix=".cfg").name
    cfg_reader = NG6ConfigReader()
    mids = cfg_reader.get_454_mids()
    # "with" guarantees the handle is closed even on error (the previous
    # code leaked it if a write failed)
    with open(mids_cfg_path, "w") as mid_file:
        mid_file.write(midscheme + "\n")
        mid_file.write("{\n")
        # description keys look like MID1:desc_ex1;MID2:desc_ex2;MID3,MID4:
        # so a single key may hold several comma-separated mid names
        for mid in self.__mids_description:
            for mid_name in mid.split(","):
                mid_name = mid_name.strip()
                try:
                    seq = mids[mid_name.lower()].split(",")
                except KeyError:
                    # unknown mid name in the ng6 config: keep the previous
                    # best-effort behaviour and skip it (but no longer
                    # swallow every other exception with a bare except)
                    continue
                if len(seq) > 1:
                    seq_val = '"' + seq[0] + '", ' + str(error) + ', "' + seq[1] + '";'
                else:
                    seq_val = '"' + seq[0] + '", ' + str(error) + ';'
                mid_file.write('\tmid = "' + self.__mids_description[mid] + '", ' + seq_val + "\n")
        mid_file.write("}")
    return [mids_cfg_path, midscheme]
def archive_files(self, files, mode, archive_name="ng6_archive.tar", delete=False):
"""
Copy, archive and compress the files list to the run. Files can then be downloaded and gave back by
......@@ -236,7 +205,7 @@ class Run(object):
logging.getLogger("Run.sync").error("Error when synchronizing analysis that belongs to run id=" + str(self.id))
raise Exception(str(err))
# If a sync is required
if len(os.listdir(self.__get_work_directory())):
if os.path.exists(self.__get_work_directory()) and len(os.listdir(self.__get_work_directory())):
# Finally the Run itself
try :
# not created for updates
......@@ -278,32 +247,43 @@ class Run(object):
"""
return self.__get_save_directory()
def get_mids_description(self):
"""
Return a dictionary with mids description
@return : {"MID1":"mid1_desc", "MID2":"mid2_desc", ...}
"""
if self.__mids_description :
return self.__mids_description
def get_samples(self):
'''
return a list of samples object
'''
if self.__samples :
return self.__samples
else :
t3mysql = t3MySQLdb()
return t3mysql.select_run_samples(self.id)
def add_mids_description(self, mids_desc):
"""
Add a mids description to the run
@param mids_desc : the mids description dictionnary
{"MID1":"mid1_desc", "MID2":"mid2_desc", ...}
"""
# First add the description to the python object
self.__mids_description = mids_desc
results = t3mysql.select_run_samples(self.id)
samples = []
for res in results :
samples.append(Sample( res['sample_id'], res['reads1'], reads2 = res['reads2'], name = res['name'], description = res['description'],
type = res['type'], insert_size = res['insert_size'], species = res['species'], nb_sequence = res['nb_sequence'] ))
return samples
if self.id != None:
def add_samples(self, samples):
'''
Add a list of samples to the run
@param samples: the list of samples object to add
'''
self.__samples = samples
if self.id != None :
# Update database
t3mysql = t3MySQLdb()
for mid_name in self.__mids_description.keys():
t3mysql.add_sample_to_run(self.id, mid=mid_name, description=self.__mids_description[mid_name])
for spo in self.__samples:
reads1 = []
reads2 = []
if spo.reads1:
reads1 = [ os.path.basename(ff) for ff in spo.reads1 ]
if spo.reads2 :
reads2 = [ os.path.basename(ff) for ff in spo.reads2 ]
t3mysql.add_sample_to_run(self.id, sample_id = spo.sample_id, reads1 = reads1, reads2 = reads2, name=spo.name,
description = spo.description, type = spo.type, insert_size = spo.insert_size,
species = spo.species, nb_sequence = spo.nb_sequence)
def get_config_file(self):
"""
Return a Run config file
......@@ -370,11 +350,12 @@ class Run(object):
self.id = t3mysql.add_run(self.name, self.date, self.directory, self.species, self.data_nature,
self.type, 0, 0, self.description, self.sequencer, self.admin_login)
# If there is some mids desc provided
if self.__mids_description != None:
for mid_name in self.__mids_description.keys():
t3mysql.add_sample_to_run(self.id, mid=mid_name, description=self.__mids_description[mid_name])
# if there are samples
if self.__samples :
for spo in self.__samples:
t3mysql.add_sample_to_run(self.id, name=spo.name, description=spo.description, reads1 = spo.reads1,
reads2 = spo.reads2, type = spo.type, insert_size = spo.insert_size,
species = spo.species, nb_sequence = spo.nb_sequence)
# Finaly return it's id
return self.id
......
......@@ -18,15 +18,34 @@
class Sample(object):
def __init__(self, name, description, reads1, reads2 = None):
AVAILABLE_TYPES = ["pe", "se", "ose", "ope", "mp"]
def __init__(self, sample_id, reads1, reads2 = None, name = None, description = None, type = None, insert_size = None, species = None, nb_sequence = None ):
self.sample_id = sample_id
self.name = name
self.description = description
self.reads1 = reads1
self.reads2 = reads2
self.insert_size = insert_size
self.nb_sequence = nb_sequence
self.species = species
self.type = type
if isinstance(reads1, str) :
self.reads1 = [reads1]
self.reads2 = reads2
if isinstance(reads2, str) :
self.reads2 = [reads2]
if self.type is None:
if self.reads2 :
self.type = self.AVAILABLE_TYPES[0]
else :
self.type = self.AVAILABLE_TYPES[1]
if self.nb_sequence and isinstance(self.nb_sequence, str) :
self.nb_sequence = int(self.nb_sequence)
self.metadata = {}
def add_metadata(self, key, value):
......@@ -39,3 +58,5 @@ class Sample(object):
def has_metadata(self, key):
return self.metadata.has_key(key)
......@@ -438,21 +438,52 @@ class t3MySQLdb(object):
conn.close()
return analysis_ids
def add_sample_to_run(self, run_id, mid, description):
def add_sample_to_run(self, run_id, sample_id, reads1, reads2 = None, name = None , description = None,
type = None, insert_size = None, species = None, nb_sequence = None):
"""