Newer
Older
mariabernard
committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python3
import os
import re
from collections import OrderedDict
def build_single_lines(file, res_dic):
    """Collect the statistics of one single-end trimming log into ``res_dic``.

    The sample name (the part of the file's base name that precedes
    ``_trimSe.log`` / ``_trimPe.log``) is appended to the module-level
    ``sample_single`` list.  Then every ``label: value`` line located between
    the ``=== Summary ===`` header and the next line starting with ``===`` is
    recorded, with its label rewritten to mention forward reads.  Finally the
    count found on the line containing "shorter than the length cutoff" is
    recorded under the label "too short reads (pairs) removed".

    Args:
        file: Path of the log file to parse.
        res_dic: Mapping of row label -> list of values (one entry per parsed
            sample).  It is updated in place.

    Raises:
        ValueError: If the file name does not follow the expected pattern, or
            if the log ends before one of the required markers is found.
    """
    name_match = re.search(r"(.+)_trim[SP]e.log", os.path.basename(file))
    if name_match is None:
        raise ValueError(
            "{}: file name does not look like <sample>_trimSe.log "
            "or <sample>_trimPe.log".format(file)
        )
    sample_single.append(name_match.group(1))

    with open(file, "r") as log_file:
        # Skip everything up to the summary header.  Iterating the file
        # (instead of looping on readline()) guarantees termination at EOF.
        for line in log_file:
            if line == "=== Summary ===\n":
                break
        else:
            raise ValueError(
                "{}: no '=== Summary ===' section found".format(file)
            )

        # forward reads
        for line in log_file:
            line = line.strip()
            if line.startswith("==="):
                # Header of the next section: the summary is over.
                break
            match = re.match(r"(.+):\s*(.+)", line)
            if not match:
                continue
            head = (
                match.group(1)
                .replace("Total", "Total forward")
                .replace("Reads", "Forward Reads")
                .replace("Quality", "Forward quality")
            )
            # Drop the parenthesised part, the " bp" unit and the thousands
            # separators.  Whitespace left in front of a removed "(...)" is
            # kept so the produced values stay identical to earlier runs.
            value = re.sub(r"\(.+\)", "", match.group(2))
            value = value.replace(" bp", "").replace(",", "")
            res_dic.setdefault(head, []).append(value)
        else:
            raise ValueError(
                "{}: summary section is not followed by another "
                "'===' header".format(file)
            )

        # final length filter
        if "shorter than the length cutoff" not in line:
            for line in log_file:
                if "shorter than the length cutoff" in line:
                    break
            else:
                raise ValueError(
                    "{}: no 'shorter than the length cutoff' line "
                    "found".format(file)
                )
        head = "too short reads (pairs) removed"
        # Keep the number between the first ':' and the following '('.
        value = line.split(":")[1].split("(")[0].strip()
        res_dic.setdefault(head, []).append(value)
def build_paired_lines(file, res_dic):
    """Collect the statistics of one paired-end trimming log into ``res_dic``.

    The sample name (the part of the file's base name that precedes
    ``_trimSe.log`` / ``_trimPe.log``) is appended to the module-level
    ``sample_paired`` list.  The log is expected to contain two
    ``=== Summary ===`` sections: the ``label: value`` lines of the first one
    are recorded with labels rewritten for forward reads, those of the second
    one with labels rewritten for reverse reads.  Finally the count found on
    the line containing "shorter than the length cutoff" is recorded under
    the label "too short reads (pairs) removed".

    Args:
        file: Path of the log file to parse.
        res_dic: Mapping of row label -> list of values (one entry per parsed
            sample).  It is updated in place.

    Raises:
        ValueError: If the file name does not follow the expected pattern, or
            if the log ends before one of the required markers is found.
    """
    name_match = re.search(r"(.+)_trim[SP]e.log", os.path.basename(file))
    if name_match is None:
        raise ValueError(
            "{}: file name does not look like <sample>_trimSe.log "
            "or <sample>_trimPe.log".format(file)
        )
    sample_paired.append(name_match.group(1))

    # Label rewrites applied to the first (forward) and second (reverse)
    # summary section, in the order the sections appear in the log.
    orientations = (
        ("forward", "Total forward", "Forward Reads", "Forward quality"),
        ("reverse", "Total reverse", "Reverse Reads", "Reverse quality"),
    )

    with open(file, "r") as log_file:
        line = ""
        for which, total_label, reads_label, quality_label in orientations:
            # Skip everything up to the next summary header.  Iterating the
            # file (instead of looping on readline()) guarantees termination
            # at EOF.
            for line in log_file:
                if line == "=== Summary ===\n":
                    break
            else:
                raise ValueError(
                    "{}: no '=== Summary ===' section found for the {} "
                    "reads".format(file, which)
                )

            for line in log_file:
                line = line.strip()
                if line.startswith("==="):
                    # Header of the next section: this summary is over.
                    break
                match = re.match(r"(.+):\s*(.+)", line)
                if not match:
                    continue
                head = (
                    match.group(1)
                    .replace("Total", total_label)
                    .replace("Reads", reads_label)
                    .replace("Quality", quality_label)
                )
                # Drop the parenthesised part, the " bp" unit and the
                # thousands separators.  Whitespace left in front of a
                # removed "(...)" is kept so the produced values stay
                # identical to earlier runs.
                value = re.sub(r"\(.+\)", "", match.group(2))
                value = value.replace(" bp", "").replace(",", "")
                res_dic.setdefault(head, []).append(value)
            else:
                raise ValueError(
                    "{}: {} summary section is not followed by another "
                    "'===' header".format(file, which)
                )

        # final length filter
        if "shorter than the length cutoff" not in line:
            for line in log_file:
                if "shorter than the length cutoff" in line:
                    break
            else:
                raise ValueError(
                    "{}: no 'shorter than the length cutoff' line "
                    "found".format(file)
                )
        head = "too short reads (pairs) removed"
        # Keep the number between the first ':' and the following '('.
        value = line.split(":")[1].split("(")[0].strip()
        res_dic.setdefault(head, []).append(value)
def build_lines(file):
    """Parse one trimming log with the parser matching its trimming mode.

    The first line starting with "Trimming mode:" decides which parser runs:
    "paired-end" sends the file to ``build_paired_lines`` (filling the
    module-level ``res_paired_lines``); any other mode sends it to
    ``build_single_lines`` (filling the module-level ``res_single_lines``).

    Args:
        file: Path of the log file to parse.

    Raises:
        ValueError: If the log contains no "Trimming mode:" line.
    """
    with open(file, "r") as log_file:
        # Iterating the file (instead of looping on readline()) guarantees
        # termination when the marker is missing.
        for line in log_file:
            if line.startswith("Trimming mode:"):
                break
        else:
            raise ValueError(
                "{}: no 'Trimming mode:' line found".format(file)
            )

    # either paired-end or single-end
    mode = line.split(":")[1].strip()
    # The mode-specific parsers reopen the file themselves, so the handle
    # above is already closed at this point.
    if mode == "paired-end":
        build_paired_lines(file, res_paired_lines)
    else:
        build_single_lines(file, res_single_lines)
# Driver: parse every log handed over through the `snakemake` object, merge
# the paired-end and single-end tables and write one tab-separated summary.

# Accumulators filled by build_paired_lines / build_single_lines.
sample_paired = []
sample_single = []
res_paired_lines = OrderedDict()
res_single_lines = OrderedDict()

# snakemake.input.logs is either a list of paths or one single path.
input_logs = snakemake.input.logs
if not isinstance(input_logs, list):
    build_lines(input_logs)
else:
    for log in input_logs:
        build_lines(log)

# Merge both tables; columns are ordered paired-end samples first, then
# single-end samples.
sample_names = sample_paired + sample_single
res_lines = OrderedDict()
if res_paired_lines:
    # Rows without a single-end counterpart get '0' for every single-end
    # sample. Only rows present in the paired-end table are kept.
    single_padding = ['0'] * len(sample_single)
    for head, paired_values in res_paired_lines.items():
        res_lines[head] = paired_values + res_single_lines.get(head, single_padding)
else:
    res_lines.update(res_single_lines)

# Write the summary: a header row of sample names (first cell empty), then
# one row per statistic.
with open(snakemake.output.out, "w") as summary_file:
    summary_file.write("\t" + "\t".join(sample_names) + "\n")
    summary_file.writelines(
        head + "\t" + "\t".join(values) + "\n"
        for head, values in res_lines.items()
    )