#!/usr/bin/env python3 from Bio import SeqIO from Bio.SeqIO.QualityIO import FastqGeneralIterator import gzip import os,sys def is_gzip(file_in): """ @return: [bool] True if the file is gziped. @param file : [str] Path to processed file. """ is_gzip = None handle_input = gzip.open( file_in ) try: handle_input.readline() is_gzip = True except: is_gzip = False finally: handle_input.close() return is_gzip def checkQualityEncode ( file_in ): """ @summary : check the quality Phred scale. Inspire on https://github.com/brentp/bio-playground/blob/master/reads-utils/guess-encoding.py @param input : [str] Path of fastq file @return : either "Q33" or "Q64" """ RANGES = { 'Q33': (33, 74), 'Q64': (64, 105) } handle_in=None if is_gzip(file_in): handle_in = gzip.open(file_in,"rt") else: handle_in = open(file_in,"rt") scale = "" gmin, gmax = 99, 0 for title_line, seq_string, quality_string in FastqGeneralIterator(handle_in): qual = [ ord(c) for c in quality_string ] qmin = min(qual) qmax = max(qual) if qmin < gmin or qmax > gmax: gmin, gmax = min(qmin, gmin), max(qmax, gmax) valid_encodings = [] for encoding, (emin, emax) in RANGES.items(): if gmin >= emin and gmax <= emax: valid_encodings.append(encoding) if len(valid_encodings) == 1 : scale = valid_encodings[0] break return scale #~ f_in="/work/project/sigenae/Maria/Sandrine_SNP-ASE/1000_RNASeq/FLLL_livr_F0/tmp.fq.gz" #~ f_out="tmp.gz" f_in=snakemake.input.fastq_in f_out=snakemake.output.fastq_out encoding = checkQualityEncode(f_in) if encoding == "Q33": print(f_in+" is "+encoding+ " and DO NOT need to be converted\n", file=sys.stderr) os.symlink(f_in, f_out) else: try: print(f_in+" is "+encoding+ " and need to be converted\n", file=sys.stderr) handle_in = gzip.open(f_in,"rt") if is_gzip(f_in) else open(f_in, "rt") handle_out = gzip.open(f_out, "wt") SeqIO.convert(handle_in, 'fastq-illumina' , handle_out, 'fastq-sanger') except: raise Exception("there is no valid encoding found")