Newer
Older
mariabernard
committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python3
from Bio import SeqIO
from Bio.SeqIO.QualityIO import FastqGeneralIterator
import gzip
import os,sys
def is_gzip(file_in):
"""
@return: [bool] True if the file is gziped.
@param file : [str] Path to processed file.
"""
is_gzip = None
handle_input = gzip.open( file_in )
try:
handle_input.readline()
is_gzip = True
except:
is_gzip = False
finally:
handle_input.close()
return is_gzip
def checkQualityEncode ( file_in ):
"""
@summary : check the quality Phred scale. Inspire on https://github.com/brentp/bio-playground/blob/master/reads-utils/guess-encoding.py
@param input : [str] Path of fastq file
@return : either "Q33" or "Q64"
"""
RANGES = {
'Q33': (33, 74),
'Q64': (64, 105)
}
handle_in=None
if is_gzip(file_in):
handle_in = gzip.open(file_in,"rt")
else:
handle_in = open(file_in,"rt")
scale = ""
gmin, gmax = 99, 0
for title_line, seq_string, quality_string in FastqGeneralIterator(handle_in):
qual = [ ord(c) for c in quality_string ]
qmin = min(qual)
qmax = max(qual)
if qmin < gmin or qmax > gmax:
gmin, gmax = min(qmin, gmin), max(qmax, gmax)
valid_encodings = []
for encoding, (emin, emax) in RANGES.items():
if gmin >= emin and gmax <= emax:
valid_encodings.append(encoding)
if len(valid_encodings) == 1 :
scale = valid_encodings[0]
break
return scale
#~ f_in="/work/project/sigenae/Maria/Sandrine_SNP-ASE/1000_RNASeq/FLLL_livr_F0/tmp.fq.gz"
#~ f_out="tmp.gz"
f_in=snakemake.input.fastq_in
f_out=snakemake.output.fastq_out
encoding = checkQualityEncode(f_in)
if encoding == "Q33":
print(f_in+" is "+encoding+ " and DO NOT need to be converted\n", file=sys.stderr)
os.symlink(f_in, f_out)
else:
try:
print(f_in+" is "+encoding+ " and need to be converted\n", file=sys.stderr)
handle_in = gzip.open(f_in,"rt") if is_gzip(f_in) else open(f_in, "rt")
handle_out = gzip.open(f_out, "wt")
SeqIO.convert(handle_in, 'fastq-illumina' , handle_out, 'fastq-sanger')
except:
raise Exception("there is no valid encoding found")