Skip to content
Snippets Groups Projects
convertPhred64_to_Phred33.py 2.28 KiB
Newer Older
#!/usr/bin/env python3
from Bio import SeqIO
from Bio.SeqIO.QualityIO import FastqGeneralIterator
import gzip
import os,sys

def is_gzip(file_in):
    """
    @return: [bool] True if the file is gziped.
    @param file : [str] Path to processed file.
    """
    is_gzip = None
    handle_input = gzip.open( file_in )
    try:
        handle_input.readline()
        is_gzip = True
    except:
        is_gzip = False
    finally:
        handle_input.close()
    return is_gzip

def checkQualityEncode ( file_in ):
    """
    @summary : check the quality Phred scale. Inspire on https://github.com/brentp/bio-playground/blob/master/reads-utils/guess-encoding.py
    @param input : [str] Path of fastq file
    @return : either "Q33" or "Q64"
    """

    RANGES = {
        'Q33': (33, 74),
        'Q64': (64, 105)
    }
    handle_in=None
    if is_gzip(file_in):
        handle_in = gzip.open(file_in,"rt")
    else:
        handle_in = open(file_in,"rt")
        
    scale = ""
    gmin, gmax  = 99, 0

    for title_line, seq_string, quality_string in FastqGeneralIterator(handle_in):
        qual = [ ord(c) for c in quality_string ]
        qmin = min(qual)
        qmax = max(qual)
        if qmin < gmin or qmax > gmax:
            gmin, gmax = min(qmin, gmin), max(qmax, gmax)
            valid_encodings = []
            for encoding, (emin, emax) in RANGES.items():
                if gmin >= emin and gmax <= emax:
                    valid_encodings.append(encoding)
            if len(valid_encodings) == 1 :
                scale = valid_encodings[0]
                break
    return scale


#~ f_in="/work/project/sigenae/Maria/Sandrine_SNP-ASE/1000_RNASeq/FLLL_livr_F0/tmp.fq.gz"
#~ f_out="tmp.gz"
f_in=snakemake.input.fastq_in
f_out=snakemake.output.fastq_out
encoding = checkQualityEncode(f_in)

if encoding == "Q33":
    print(f_in+" is "+encoding+ " and DO NOT need to be converted\n", file=sys.stderr)
    os.symlink(f_in, f_out)
else:
    try: 
        print(f_in+" is "+encoding+ " and need to be converted\n", file=sys.stderr)
        handle_in = gzip.open(f_in,"rt") if is_gzip(f_in) else open(f_in, "rt")
        handle_out = gzip.open(f_out, "wt")
        SeqIO.convert(handle_in, 'fastq-illumina' , handle_out, 'fastq-sanger')
    except:
        raise Exception("there is no valid encoding found")