Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add capability to parse and check user-given bases mask (demux_reads … #7

Merged
merged 1 commit into from
Feb 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bandit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
skips: ['B101']
19 changes: 15 additions & 4 deletions digestiflow_demux/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ from digestiflow_demux.snakemake_support import (
wrapper_path,
)

from digestiflow_demux.bases_mask import return_bases_mask

# -------------------------------------------------------------------------------------------------
# Helper Functions

Expand All @@ -18,8 +20,16 @@ def out_prefix(path):
return os.path.join(config["output_dir"], path)


def demux_reads(flowcell):
return flowcell.get("demux_reads", flowcell["planned_reads"])
def demux_reads(flowcell, demux_tool):
demux_reads = flowcell.get("demux_reads") or flowcell["planned_reads"]
planned_reads = flowcell["planned_reads"]

if demux_tool == "bcl2fastq" and demux_reads == planned_reads:
return ""
elif demux_tool == "bcl2fastq":
return return_bases_mask(planned_reads, demux_reads)
else:
return return_bases_mask(planned_reads, demux_reads, "picard")

# -------------------------------------------------------------------------------------------------
# Define local rules.
Expand Down Expand Up @@ -63,6 +73,7 @@ rule demux:
flowcell_token=config["flowcell"]["vendor_id"],
input_dir=config["input_dir"],
output_dir=config["output_dir"],
read_structure=demux_reads(config["flowcell"], config["demux_tool"]),
tiles_arg=get_tiles_arg(config),
threads: config["cores"]
wrapper: wrapper_path(bcl2fastq_wrapper(config))
Expand All @@ -80,7 +91,7 @@ rule demux_picard:
machine_name=config["flowcell"]["sequencing_machine"],
flowcell_token=config["flowcell"]["vendor_id"],
run_number=config["flowcell"]["run_number"],
read_structure=demux_reads(config["flowcell"]),
read_structure=demux_reads(config["flowcell"], config["demux_tool"]),
input_dir=config["input_dir"],
output_dir=config["output_dir"],
tiles_arg=get_tiles_arg(config),
Expand All @@ -93,7 +104,7 @@ rule prepare_picard:
output: out_prefix("picard_barcodes/{lane}/metrics.txt")
params:
flowcell_token=config["flowcell"]["vendor_id"],
read_structure=demux_reads(config["flowcell"]),
read_structure=demux_reads(config["flowcell"], config["demux_tool"]),
input_dir=config["input_dir"],
threads: config["cores"]
wrapper: wrapper_path("picard/extract_barcodes")
Expand Down
94 changes: 94 additions & 0 deletions digestiflow_demux/bases_mask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
class BaseMaskConfigException(Exception):
"""Raise if base mask for demux_reads is malformed"""

pass


def split_bases_mask(bases_mask):
"""Parse a picard style bases mask and return a list of tuples
>>> split_bases_mask("100T8B8B100T")
[('T', 100), ('B', 8), ('B', 8), ('T', 100)]
"""
splitat = []
for i, c in enumerate(bases_mask):
if c.isalpha():
splitat.append(i)
holtgrewe marked this conversation as resolved.
Show resolved Hide resolved

# Check that mask is well-behaved
if splitat[0] == 0:
raise BaseMaskConfigException("Mask must start with number of cycles, not type")
# Check that no letters appear next to each other
diffs = []
for i in range(len(splitat) - 1):
diffs.append(splitat[i + 1] - splitat[i])
if (0 in diffs) or (1 in diffs):
raise BaseMaskConfigException("Type characters must be separated by a number (of cycles)")

result = []
num = ""
for i, _character in enumerate(bases_mask):
if i not in splitat:
num += bases_mask[i]
elif int(num) == 0:
pass
else:
result.append((bases_mask[i].upper(), int(num)))
num = ""

return result


def compare_bases_mask(planned_reads, bases_mask):
"""Match user input bases mask to planned_reads from flowcell and decide if compatible.
Return list of lists of tuples with type and number of cycles"""

planned = split_bases_mask(planned_reads)
mask = split_bases_mask(bases_mask)

lengths1 = [count for _type, count in planned]
lengths2 = [count for _type, count in mask]
if not sum(lengths1) == sum(lengths2):
raise BaseMaskConfigException("Your base mask has more or fewer cycles than planned")

matched_mask = []
for _type, cycles in planned:
read = []
s = 0
while s < cycles:
i = mask.pop(0)
read.append(i)
s += i[1]
if s > cycles:
raise BaseMaskConfigException(
"Your base mask has more or fewer cycles than planned for a read"
)
matched_mask.append(read)

return matched_mask


def translate_tuple_to_basemask(tup, demux_tool):
"""Return illumina or picard-style base mask string"""

picard_to_illumina = {"T": "y", "S": "n", "B": "I"}

if demux_tool == "bcl2fastq":
return picard_to_illumina[tup[0]] + str(tup[1])
else:
return str(tup[1]) + tup[0]


def return_bases_mask(planned_reads, demux_reads, demux_tool="bcl2fastq"):
"""Parse planned_reads and demux_reads (user-configured base mask), compare for compatiblity
and return a string to either give to bcl2fastq or picard"""

if "M" in demux_reads and demux_tool == "bcl2fastq":
raise BaseMaskConfigException("You cannot assign UMIs ('M') if using bcl2fastq")

mask_list = compare_bases_mask(planned_reads, demux_reads)

new_mask = []
for lst in mask_list:
substr = [translate_tuple_to_basemask(t, demux_tool) for t in lst]
new_mask.append("".join(substr))
return ",".join(new_mask) if demux_tool == "bcl2fastq" else "".join(new_mask)
3 changes: 3 additions & 0 deletions digestiflow_demux/snakemake_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import functools
import os

from bases_mask import return_bases_mask

__author__ = "Manuel Holtgrewe <manuel.holtgrewe@bihealth.de>"


Expand Down Expand Up @@ -121,6 +123,7 @@ def out_prefix(path):

flowcell = config["flowcell"]
demux_reads = flowcell.get("demux_reads") or flowcell["planned_reads"]
demux_reads = return_bases_mask(flowcell["planned_reads"], demux_reads, "picard")
is_paired = demux_reads.count("T") > 1
sample_map = build_sample_map(flowcell)

Expand Down
Empty file.
113 changes: 113 additions & 0 deletions digestiflow_demux/tests/test_bases_mask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import pytest

from ..bases_mask import (
split_bases_mask,
compare_bases_mask,
return_bases_mask,
BaseMaskConfigException,
)


def test_parse0():
bases_mask1 = "100t8B8b100T"
x = split_bases_mask(bases_mask1)
assert [("T", 100), ("B", 8), ("B", 8), ("T", 100)] == x


def test_parse1():
bases_mask1 = "100T8B8B100T"
x = split_bases_mask(bases_mask1)
assert [("T", 100), ("B", 8), ("B", 8), ("T", 100)] == x


def test_parse2():
bases_mask1 = "100T8B7B1S100T"
x = split_bases_mask(bases_mask1)
assert [("T", 100), ("B", 8), ("B", 7), ("S", 1), ("T", 100)] == x


def test_parse3():
bases_mask1 = "100T8B0B100T"
x = split_bases_mask(bases_mask1)
assert [("T", 100), ("B", 8), ("T", 100)] == x


def test_parse4():
bases_mask1 = "100T8B00B100T"
x = split_bases_mask(bases_mask1)
assert [("T", 100), ("B", 8), ("T", 100)] == x


def test_parse5():
bases_mask1 = "T8B00B100T"
with pytest.raises(BaseMaskConfigException):
split_bases_mask(bases_mask1)


def test_parse6():
bases_mask1 = "100TT8B10B100T"
with pytest.raises(BaseMaskConfigException):
split_bases_mask(bases_mask1)


def test_compatibility0():
"""demux_reads and planned_reads are the same"""
planned_reads = "100T8B8B100T"
user_mask = "100T8B8B100T"
assert compare_bases_mask(planned_reads, user_mask)


def test_compatibility1():
"""demux_reads is compatible with planned reads"""
planned_reads = "100T8B8B100T"
user_mask = "100T8B7B1S100T"
assert compare_bases_mask(planned_reads, user_mask)


def test_compatibility2():
planned_reads = "100T8B8B100T"
user_mask = "90T10S8B8B100T"
assert compare_bases_mask(planned_reads, user_mask)


def test_compatibility3():
"""Total lengths differ"""
planned_reads = "100T8B8B100T"
user_mask = "100T8B10B100T"
with pytest.raises(BaseMaskConfigException):
compare_bases_mask(planned_reads, user_mask)


def test_case1():
x = return_bases_mask("100T8B", "100T8B")
assert x == "y100,I8"


def test_case1_picard():
x = return_bases_mask("100T8B", "100T8B", "picard")
assert x == "100T8B"


def test_case2():
x = return_bases_mask("100T8B", "90T10S8B")
assert x == "y90n10,I8"


def test_case2_picard():
x = return_bases_mask("100T8B", "90T10S8B", "picard")
assert x == "90T10S8B"


def test_case3():
x = return_bases_mask("100T8B", "90T10S8B")
assert x == "y90n10,I8"


def test_case3_picard():
x = return_bases_mask("100T8B", "90T10S8B", "picard")
assert x == "90T10S8B"


def test_case4():
x = return_bases_mask("100T8B8B100T", "90T10S8B8B100T")
assert x == "y90n10,I8,I8,y100"
2 changes: 1 addition & 1 deletion digestiflow_demux/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def write_sample_sheet_v1(writer, flowcell, libraries):
]
writer.writerow(header)

demux_reads = flowcell.get("demux_reads", flowcell["planned_reads"])
demux_reads = flowcell.get("demux_reads") or flowcell["planned_reads"]
recipe = "PE_indexing" if demux_reads.count("T") > 1 else "SE_indexing"
for lib in libraries:
if lib["barcode2"]:
Expand Down
7 changes: 6 additions & 1 deletion digestiflow_demux/wrappers/bcl2fastq2/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
# More than 8 threads will not work for bcl2fastq.
bcl2fastq_threads = min(8, snakemake.config["cores"]) # noqa

if snakemake.params.read_structure: # noqa
bases_mask = "--use-bases-mask " + snakemake.params.read_structure # noqa
else:
bases_mask = ""

shell(
r"""
set -euo pipefail
Expand All @@ -43,7 +48,7 @@
--output-dir $TMPDIR/demux_out \
--interop-dir $TMPDIR/interop_dir \
--processing-threads {bcl2fastq_threads} \
{snakemake.params.tiles_arg}
{bases_mask} {snakemake.params.tiles_arg}

tree $TMPDIR/demux_out

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@

head -n 1000 $sheet

picard IlluminaBasecallsToFastq \
picard -Xmx16g \
IlluminaBasecallsToFastq \
BASECALLS_DIR={snakemake.params.input_dir}/Data/Intensities/BaseCalls \
READ_STRUCTURE={snakemake.params.read_structure} \
LANE=$lane \
Expand Down