# trimgalore_summary.py

#!/usr/bin/env python3
import os
import re
from collections import OrderedDict

# Line that opens a statistics block in the log.
_SUMMARY_HEADER = "=== Summary ==="
# Fragment identifying the line that reports what the length filter removed.
_LENGTH_FILTER_MARKER = "shorter than the length cutoff"
# Row label under which the length-filter count is stored.
_TOO_SHORT_HEAD = "too short reads (pairs) removed"
# "<label>: <value>" entries inside a summary block.
_SUMMARY_ENTRY = re.compile(r"(.+):\s*(.+)")
# Parenthesised annotation following a value, e.g. "(34.6%)".
_PARENTHESISED = re.compile(r"\(.+\)")


def _sample_name(file):
    """Return the part of the log's base name preceding ``_trimSe.log``/``_trimPe.log``."""
    match = re.search(r"(.+)_trim[SP]e.log", os.path.basename(file))
    if match is None:
        raise ValueError(
            "cannot derive a sample name from log file name {!r}".format(file)
        )
    return match.group(1)


def _clean_value(raw):
    """Drop the parenthesised annotation, the ' bp' unit and thousands separators."""
    value = _PARENTHESISED.sub("", raw)
    # strip() removes the blank left behind once "(…)" has been cut off.
    return value.replace(" bp", "").replace(",", "").strip()


def _read_summary(log_file, direction, file):
    """Consume the next summary block and return its (head, value) pairs.

    ``direction`` ("forward" or "reverse") is woven into each label so that
    the two blocks of a paired-end log end up in distinct rows.
    """
    for line in log_file:
        if line.strip() == _SUMMARY_HEADER:
            break
    else:
        # EOF reached: without this check the search would never terminate.
        raise ValueError(
            "{}: no '{}' block found for {} reads".format(
                file, _SUMMARY_HEADER, direction
            )
        )

    capitalised = direction.capitalize()
    entries = []
    for line in log_file:
        line = line.strip()
        if line.startswith("==="):
            # The next "=== ... ===" heading closes the summary block.
            return entries
        match = _SUMMARY_ENTRY.match(line)
        if match:
            head = match.group(1)
            head = head.replace("Total", "Total " + direction)
            head = head.replace("Reads", capitalised + " Reads")
            head = head.replace("Quality", capitalised + " quality")
            entries.append((head, _clean_value(match.group(2))))
    raise ValueError(
        "{}: {} summary block is not terminated".format(file, direction)
    )


def _read_too_short_count(log_file, file):
    """Return the count reported on the next length-filter line of the log."""
    for line in log_file:
        if _LENGTH_FILTER_MARKER in line:
            fields = line.split(":")
            if len(fields) < 2:
                raise ValueError(
                    "{}: malformed length-filter line {!r}".format(file, line)
                )
            # Keep only the number; a "(…%)" annotation may follow it.
            return fields[1].split("(")[0].strip()
    raise ValueError(
        "{}: no line containing {!r} found".format(file, _LENGTH_FILTER_MARKER)
    )


def _parse_log(file, directions):
    """Parse one summary block per direction, then the length-filter line."""
    entries = []
    with open(file, "r") as log_file:
        for direction in directions:
            entries.extend(_read_summary(log_file, direction, file))
        entries.append((_TOO_SHORT_HEAD, _read_too_short_count(log_file, file)))
    return entries


def _record(entries, res_dic):
    """Append every parsed value to the list stored under its head."""
    for head, value in entries:
        res_dic.setdefault(head, []).append(value)


def build_single_lines(file, res_dic):
    """Parse a single-end trimming log and append its statistics to ``res_dic``.

    The sample name is appended to the module-level ``sample_single`` list.
    Nothing is recorded unless the whole log parsed successfully.

    Args:
        file: Path of the log; its base name must contain
            ``<sample>_trimSe.log`` or ``<sample>_trimPe.log``.
        res_dic: Mapping of row label to list of per-sample values; updated
            in place.

    Raises:
        ValueError: If the sample name cannot be derived from the file name,
            or the log lacks the summary block or the length-filter line.
    """
    name = _sample_name(file)
    entries = _parse_log(file, ("forward",))
    sample_single.append(name)
    _record(entries, res_dic)


def build_paired_lines(file, res_dic):
    """Parse a paired-end trimming log and append its statistics to ``res_dic``.

    The log is expected to hold two summary blocks: the first is labelled as
    forward reads, the second as reverse reads. The sample name is appended to
    the module-level ``sample_paired`` list. Nothing is recorded unless the
    whole log parsed successfully.

    Args:
        file: Path of the log; its base name must contain
            ``<sample>_trimSe.log`` or ``<sample>_trimPe.log``.
        res_dic: Mapping of row label to list of per-sample values; updated
            in place.

    Raises:
        ValueError: If the sample name cannot be derived from the file name,
            or the log lacks a summary block or the length-filter line.
    """
    name = _sample_name(file)
    entries = _parse_log(file, ("forward", "reverse"))
    sample_paired.append(name)
    _record(entries, res_dic)


def build_lines(file):
    """Detect the trimming mode of a log and hand it to the matching parser.

    The first line starting with ``Trimming mode:`` decides the route:
    ``paired-end`` logs go to ``build_paired_lines`` (results collected in the
    module-level ``res_paired_lines``), anything else goes to
    ``build_single_lines`` (results collected in ``res_single_lines``).

    Args:
        file: Path of the log file to parse.

    Raises:
        ValueError: If the log contains no ``Trimming mode:`` line.
    """
    mode = None
    with open(file, "r") as log_file:
        for line in log_file:
            if line.startswith("Trimming mode:"):
                # either paired-end or single-end
                mode = line.split(":")[1].strip()
                break

    if mode is None:
        # Reaching EOF used to loop forever; fail loudly instead.
        raise ValueError("{}: no 'Trimming mode:' line found".format(file))

    # The log is closed at this point; the parsers reopen it themselves.
    if mode == "paired-end":
        build_paired_lines(file, res_paired_lines)
    else:
        build_single_lines(file, res_single_lines)

# parse logs
# These containers stay at module level on purpose: the build_* functions
# read them as globals while parsing.
sample_paired = []
sample_single = []
res_paired_lines = OrderedDict()
res_single_lines = OrderedDict()

# NOTE(review): `snakemake` is not defined in this file; presumably it is
# injected by Snakemake when the script runs.
# The input is either a list of log paths or a single path.
logs = snakemake.input.logs
if not isinstance(logs, list):
    logs = [logs]
for log in logs:
    build_lines(log)

# merge in one dict
# Columns are ordered paired samples first, then single-end samples.
sample_names = sample_paired + sample_single
res_lines = OrderedDict()

for head, paired_values in res_paired_lines.items():
    # Rows unknown to the single-end samples are filled with '0' for them.
    single_values = res_single_lines.get(head, ['0'] * len(sample_single))
    res_lines[head] = paired_values + single_values

for head, single_values in res_single_lines.items():
    if head not in res_paired_lines:
        # Previously such rows were dropped whenever paired samples existed;
        # pad the paired columns with '0' so that no statistic is lost.
        res_lines[head] = ['0'] * len(sample_paired) + single_values

# write results
# Tab-separated table: header row of sample names, then one row per statistic.
with open(snakemake.output.out, "w") as summary_file:
    summary_file.write("\t" + "\t".join(sample_names) + "\n")
    for head, values in res_lines.items():
        summary_file.write(head + "\t" + "\t".join(values) + "\n")