Commit 992a5b3b authored by Audrey Gibert's avatar Audrey Gibert
Browse files

This commit resolves:

issue #97
issue #96, whose merge request has not been accepted yet
parent b8ff28c8
...@@ -21,11 +21,11 @@ import logging ...@@ -21,11 +21,11 @@ import logging
from jflow.component import Component from jflow.component import Component
def extract_stats_from_seq_file(input_file,output_file_stat): def extract_stats_from_seq_file(input_file, output_file_stat):
import jflow.seqio as seqio import jflow.seqio as seqio
import logging import logging
import os import os
logging.getLogger("AddRawFiles").debug("extract_stats_from_seq_files. Entering, working on "+input_file) logging.getLogger("AddRawFiles").debug("extract_stats_from_seq_files. Entering, working on " + input_file)
nb_seq, full_size = 0, 0 nb_seq, full_size = 0, 0
try: try:
# Get nb_seq and full_size values # Get nb_seq and full_size values
...@@ -41,25 +41,33 @@ def extract_stats_from_seq_file(input_file,output_file_stat): ...@@ -41,25 +41,33 @@ def extract_stats_from_seq_file(input_file,output_file_stat):
logging.getLogger("AddRawFiles").debug("extract_stats_from_seq_files. finished") logging.getLogger("AddRawFiles").debug("extract_stats_from_seq_files. finished")
def md5sum(md5_file_out, *files_to_md5sum): def md5sum(md5_file_out, files_to_md5sum):
import jflow.seqio as seqio import jflow.seqio as seqio
import logging import logging
import os import os
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
logging.getLogger("AddRawFiles").debug("md5sum. entering") logging.getLogger("AddRawFiles").debug("md5sum. entering")
logging.getLogger("AddRawFiles").debug("md5sum. files_to_md5sum in "+os.path.dirname(md5_file_out)+" = " + ",".join(files_to_md5sum)) logging.getLogger("AddRawFiles").debug("md5sum. md5sum in this directory : "+os.path.dirname(md5_file_out))
p = Popen("md5sum " + " ".join(files_to_md5sum) +"|awk -F/ {'print $1,$NF'} > "+ md5_file_out ,shell=True,stdout=PIPE,stderr=PIPE,universal_newlines=True) logging.getLogger("AddRawFiles").debug("md5sum. Generating the md5sum.txt with this command : " +
"find " + os.path.dirname(files_to_md5sum) + " -maxdepth 1 -type f -not -name " + os.path.basename(md5_file_out) +
" -exec md5sum {} \; | awk -F/ {'print $1$NF'} > "+ md5_file_out)
p = Popen("find " + os.path.dirname(files_to_md5sum) + " -maxdepth 1 -type f -not -name " + os.path.basename(md5_file_out) + " -exec md5sum {} \;" +
"| awk -F/ {'print $1$NF'} > "+ md5_file_out, shell = True, stdout = PIPE, stderr = PIPE, universal_newlines = True)
stdout,stderr = p.communicate() stdout,stderr = p.communicate()
logging.getLogger("AddRawFiles").debug("md5sum. finished") logging.getLogger("AddRawFiles").debug("md5sum. finished")
def add_stats_to_run (run_dump_path,*files): def add_stats_to_run (run_dump_path,file):
import pickle import pickle
import logging import logging
total_nb_seq = 0 total_nb_seq = 0
total_size = 0 total_size = 0
for curr_stat_file in files: logging.getLogger("AddRawFiles").debug("add_stats_to_run. Working on files of the directory containing this one : " + file)
logging.getLogger("AddRawFiles").debug("add_stats_to_run. Content of " + curr_stat_file) file_list = [ f for f in os.listdir(os.path.dirname(file)) if f.endswith('.count')]
with open(curr_stat_file, 'r') as myfile: logging.getLogger("AddRawFiles").debug("add_stats_to_run. Length on the file_list : " + str(len(file_list)))
logging.getLogger("AddRawFiles").debug("add_stats_to_run. Files on the file_list : " + str(file_list))
for curr_stat_file in file_list:
logging.getLogger("AddRawFiles").debug("add_stats_to_run. Content of " + os.path.dirname(file) + "/" + curr_stat_file)
with open(os.path.dirname(file) + "/" + curr_stat_file, 'r') as myfile:
nb_seq,size=myfile.read().split(':') nb_seq,size=myfile.read().split(':')
logging.getLogger("AddRawFiles").debug("add_stats_to_run. Content of " + curr_stat_file + " nb_seq = " + str(nb_seq) + ", size = " + str(size)) logging.getLogger("AddRawFiles").debug("add_stats_to_run. Content of " + curr_stat_file + " nb_seq = " + str(nb_seq) + ", size = " + str(size))
size= int(size) size= int(size)
...@@ -149,14 +157,15 @@ class AddRawFiles (Component): ...@@ -149,14 +157,15 @@ class AddRawFiles (Component):
files_to_save_stats = self.get_outputs( '{basename_woext}.count', self.files_to_save) files_to_save_stats = self.get_outputs( '{basename_woext}.count', self.files_to_save)
logging.getLogger("AddRawFiles").debug("process. Before self.add_python_execution(extract_stats_from_seq_file") logging.getLogger("AddRawFiles").debug("process. Before self.add_python_execution(extract_stats_from_seq_file")
#count number of reads and total length in base for each seq file #count number of reads and total length in base for each seq file
for i,o in zip(self.files_to_save,files_to_save_stats ): for i,o in zip(self.files_to_save, files_to_save_stats ):
self.add_python_execution(extract_stats_from_seq_file,cmd_format="{EXE} {IN} {OUT}", self.add_python_execution(extract_stats_from_seq_file,cmd_format="{EXE} {IN} {OUT}",
inputs = i, outputs = o, map=False) inputs = i, outputs = o, map=False)
logging.getLogger("AddRawFiles").debug("process. Before self.add_stats_to_run(extract_stats_from_seq_file") logging.getLogger("AddRawFiles").debug("process. Before self.add_stats_to_run(extract_stats_from_seq_file")
logging.getLogger("AddRawFiles").debug("process. Dirname of files_to_save_stats[0] : "+ os.path.dirname(files_to_save_stats[0]))
#Add number of reads and total length in base for each seq file and add these data to the run in the database #Add number of reads and total length in base for each seq file and add these data to the run in the database
self.add_python_execution(add_stats_to_run, cmd_format='{EXE} {ARG} {IN} > {OUT}', map=False, self.add_python_execution(add_stats_to_run, cmd_format='{EXE} {ARG} {IN} > {OUT}', map=False,
outputs=self.stdout, inputs=files_to_save_stats,includes=self.files_to_save, arguments=[run_dump_path]) outputs = self.stdout, inputs = files_to_save_stats[0], includes=self.files_to_save, arguments=[run_dump_path])
#archive the files in the work folder of the run to be rsynced at the end #archive the files in the work folder of the run to be rsynced at the end
logging.getLogger("AddRawFiles").debug("process. Before copying/archiving files with compression = " + self.compression ) logging.getLogger("AddRawFiles").debug("process. Before copying/archiving files with compression = " + self.compression )
...@@ -173,7 +182,7 @@ class AddRawFiles (Component): ...@@ -173,7 +182,7 @@ class AddRawFiles (Component):
for idx, file in enumerate(files_to_sync): for idx, file in enumerate(files_to_sync):
self.add_python_execution(copy_file,cmd_format="{EXE} {IN} {OUT}", self.add_python_execution(copy_file,cmd_format="{EXE} {IN} {OUT}",
inputs = self.files_to_save[idx], outputs = file, map=False) inputs = self.files_to_save[idx], outputs = file, map=False)
#TODO possible inconsistancy : if the filename ends with one extension in Utils.UNCOMPRESS_EXTENSION, output file name won't be suffixed with gz in Utils.gzip #TODO: possible inconsistancy : if the filename ends with one extension in Utils.UNCOMPRESS_EXTENSION, output file name won't be suffixed with gz in Utils.gzip
elif self.compression=="gz": elif self.compression=="gz":
for file in self.files_to_save: for file in self.files_to_save:
files_to_sync_ori.append(os.path.join(self.runobj.get_work_directory(),os.path.basename(file)+".gz")) files_to_sync_ori.append(os.path.join(self.runobj.get_work_directory(),os.path.basename(file)+".gz"))
...@@ -183,7 +192,7 @@ class AddRawFiles (Component): ...@@ -183,7 +192,7 @@ class AddRawFiles (Component):
for idx, file in enumerate(self.files_to_save): for idx, file in enumerate(self.files_to_save):
self.add_python_execution(zip_file,cmd_format="{EXE} {IN} {OUT}", self.add_python_execution(zip_file,cmd_format="{EXE} {IN} {OUT}",
inputs = file, outputs = files_to_sync[idx], map=False) inputs = file, outputs = files_to_sync[idx], map=False)
#TODO possible inconsistancy : if the filename ends with one extension in Utils.UNCOMPRESS_EXTENSION, output file name won't be suffixed with bz2 in Utils.bz2 #TODO: possible inconsistancy : if the filename ends with one extension in Utils.UNCOMPRESS_EXTENSION, output file name won't be suffixed with bz2 in Utils.bz2
elif self.compression=="bz2": elif self.compression=="bz2":
for file in self.files_to_save: for file in self.files_to_save:
files_to_md5_ori.append(os.path.join(self.runobj.get_work_directory(),os.path.basename(file))) files_to_md5_ori.append(os.path.join(self.runobj.get_work_directory(),os.path.basename(file)))
...@@ -213,8 +222,7 @@ class AddRawFiles (Component): ...@@ -213,8 +222,7 @@ class AddRawFiles (Component):
if (len(files_to_md5_ori)>0): if (len(files_to_md5_ori)>0):
md5_file = os.path.join(self.runobj.get_work_directory(), "md5sum.txt") md5_file = os.path.join(self.runobj.get_work_directory(), "md5sum.txt")
self.add_python_execution(md5sum,cmd_format="{EXE} {OUT} {IN}", self.add_python_execution(md5sum, cmd_format="{EXE} {OUT} {IN}",
inputs = files_to_md5, outputs = md5_file, map=False) inputs = files_to_md5[0], outputs = md5_file, map=False)
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment