Skip to content

Commit

Permalink
Merge pull request #93 from agalitsyna/origin/parse_all
Browse files Browse the repository at this point in the history
Junction index update
  • Loading branch information
golobor committed Feb 23, 2021
2 parents d9e5ed2 + a3cd18e commit 5d0c9cf
Show file tree
Hide file tree
Showing 8 changed files with 330 additions and 346 deletions.
106 changes: 49 additions & 57 deletions doc/_static/rescue_modes.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
296 changes: 143 additions & 153 deletions doc/_static/rescue_modes_readthrough.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
17 changes: 11 additions & 6 deletions doc/parsing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -151,12 +151,17 @@ reads read through the same ligation junction. However, these cases are successf

Reporing complex walks in case of readthrough

To restore the sequence of ligation events, there is a special field ``chimera_index`` that can be reported as
a separate column of .pair file by setting ``--add-columns chimera_index``. This field contains information on:
- alignment type (unique, multimapped, unmapped)
- the order of alignment in forward read, if present (0, if not),
- the order of alignment in reverse read, if present (0, if not),
and has the general form of "Uf1r0". With this information, the whole sequence of ligation events can be restored from the .pair file.
To restore the sequence of ligation events, there is a special field ``junction_index`` that can be reported as
a separate column of .pair file by setting ``--add-junction-index``. This field contains information on:

- the order of the junction in the recovered walk, starting from 5'-end of forward read
- type of the junction:

- "u" - unconfirmed junction, right and left alignments in the pair originate from different reads (forward or reverse). This might be indirect ligation (mediated by other DNA fragments).
- "f" - pair originates from the forward read. This is direct ligation.
- "r" - pair originated from the reverse read. Direct ligation.
- "b" - pair was sequenced at both forward and reverse read. Direct ligation.
With this information, the whole sequence of ligation events can be restored from the .pair file.


.. _section-single-ligation-rescue:
Expand Down
3 changes: 2 additions & 1 deletion pairtools/_pairsam_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
COL_SAM2 = 9

COLUMNS = ['readID', 'chrom1', 'pos1', 'chrom2', 'pos2',
'strand1', 'strand2', 'pair_type', 'sam1', 'sam2']
'strand1', 'strand2', 'pair_type', 'sam1', 'sam2',
'junction_index']

UNMAPPED_CHROM = '!'
UNMAPPED_POS = 0
Expand Down
111 changes: 51 additions & 60 deletions pairtools/_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ def parse_sams_into_pair(sams1,
Two alignments selected for reporting as a Hi-C pair.
algns1, algns2
All alignments, sorted according to their order in on a read.
junction_index
Junction index of a pair in the molecule.
"""

# Check if there is at least one SAM entry per side:
Expand All @@ -30,7 +32,7 @@ def parse_sams_into_pair(sams1,
algns2 = [empty_alignment()]
algns1[0]['type'] = 'X'
algns2[0]['type'] = 'X'
return [ [algns1[0], algns2[0], algns1, algns2] ]
return [ [algns1[0], algns2[0], algns1, algns2, '1u'] ]

# Generate a sorted, gap-filled list of all alignments
algns1 = [parse_algn(sam.rstrip().split('\t'), min_mapq,
Expand All @@ -46,12 +48,6 @@ def parse_sams_into_pair(sams1,
_convert_gaps_into_alignments(algns1, max_inter_align_gap)
_convert_gaps_into_alignments(algns2, max_inter_align_gap)

# Enumerating the alignments for 'chimera_index' field
for i in range(len(algns1)):
algns1[i]['chimera_index'] = algns1[i]['type'] + "f" + str(i+1)
for i in range(len(algns2)):
algns2[i]['chimera_index'] = algns2[i]['type'] + "r" + str(i+1)

# Define the type of alignment on each side.
# The most important split is between chimeric alignments and linear
# alignments.
Expand All @@ -61,6 +57,8 @@ def parse_sams_into_pair(sams1,

hic_algn1 = algns1[0]
hic_algn2 = algns2[0]
# By default, assume each molecule is a single ligation with single unconfirmed junction:
junction_index = '1u'

# Parse chimeras
rescued_linear_side = None
Expand All @@ -74,15 +72,16 @@ def parse_sams_into_pair(sams1,
# Report only two alignments for a read pair
rescued_linear_side = rescue_walk(algns1, algns2, max_molecule_size)

# if the walk is unrescueable:
if rescued_linear_side is None:
# Walk was rescued as a simple walk:
if rescued_linear_side is not None:
junction_index = f'{1}{"f" if rescued_linear_side==1 else "r"}'
# Walk is unrescuable:
else:
if walks_policy == 'mask':
hic_algn1 = _mask_alignment(dict(hic_algn1))
hic_algn2 = _mask_alignment(dict(hic_algn2))
hic_algn1['type'] = 'W'
hic_algn2['type'] = 'W'
hic_algn1['chimera_index'] = 'W1'
hic_algn2['chimera_index'] = 'W1'

elif walks_policy == '5any':
hic_algn1 = algns1[0]
Expand Down Expand Up @@ -127,7 +126,7 @@ def parse_sams_into_pair(sams1,
hic_algn2 = dict(hic_algn2)
hic_algn2['type'] = hic_algn2['type'].lower()

return [ [hic_algn1, hic_algn2, algns1, algns2] ]
return [ [hic_algn1, hic_algn2, algns1, algns2, junction_index] ]


def parse_cigar(cigar):
Expand Down Expand Up @@ -196,7 +195,6 @@ def empty_alignment():
'clip5_ref': 0,
'read_len': 0,
'type':'N',
'chimera_index': 'N0'
}


Expand Down Expand Up @@ -257,7 +255,6 @@ def parse_algn(
'dist_to_5': dist_to_5,
'dist_to_3': dist_to_3,
'type': ('N' if not is_mapped else ('M' if not is_unique else 'U')),
'chimera_index': '0' # Index cannot be determined prior to full alignment parsing
}

algn.update(cigar)
Expand Down Expand Up @@ -371,13 +368,13 @@ def rescue_walk(algns1, algns2, max_molecule_size):
# in the output
algns1[1]['type'] = 'X'
algns2[0]['type'] = 'R'
return 2
return 1
else:
algns1[0]['type'] = 'R'
# changing the type of the 3' alignment on side 2, does not show up
# in the output
algns2[1]['type'] = 'X'
return 1
return 2
else:
return None

Expand All @@ -391,12 +388,9 @@ def rescue_complex_walk(algns1, algns2, max_molecule_size, allowed_offset=3):
If one of the reads contains ligation junction, this might lead to reporting fake contact.
Thus, the pairs of contacts that overlap are paired-end duplicates and should be reported uniquely.
Return: list of all the rescued pairs after deduplication.
PP are pairs that appear at the end of reads and do not seem to be the same molecule (see ends_do_overlap).
JJ are pairs that have an observed ligation junction.
Return: list of all the rescued pairs after deduplication with junction index for each pair.
Example of iterative search:
Example of iterative search (note that it's for the illustration of the algorithm only):
Forward read: Reverse read:
----------------------> <-----------------------
Expand Down Expand Up @@ -483,62 +477,57 @@ def rescue_complex_walk(algns1, algns2, max_molecule_size, allowed_offset=3):
if not is_overlap: # No overlap found, roll the current_idx_reverse back to the initial value
current_reverse_junction = 1

# current_reverse_junction is 1 if:
# no overlapping junctions found, or less than 2 chimeras in either forward or reverse read.
# In this case, we check whether the last alignments of forward and reverse reads overlap.
# If no overlapping junctions found, or there are less than 2 chimeras in either forward or reverse read,
# then current_reverse_junction is 1,
# check whether the last alignments of forward and reverse reads overlap.
if current_reverse_junction == 1:
last_reported_alignment_forward = last_reported_alignment_reverse = 1
if ends_do_overlap(algns1[-1], algns2[-1], max_molecule_size, allowed_offset):
# Report the modified last junctions:
if n_algns1 >= 2:
# "dict" trick to store the type of contact and not modify original entry:
# store the type of contact and do not modify original entry:
hic_algn1 = dict(algns1[-2])
hic_algn2 = dict(algns2[-1])
# Modify pos3 of reverse read alignment to correspond to actual observed 5' ends in forward read:
hic_algn2['pos3'] = algns1[-1]['pos5']
hic_algn1['type'] = ('n' if not hic_algn1['is_mapped'] else ('m' if not hic_algn1['is_unique'] else 'u'))
hic_algn2['type'] = ('n' if not hic_algn2['is_mapped'] else ('m' if not hic_algn2['is_unique'] else 'u'))
hic_algn1['chimera_index'] = hic_algn1['type'] + "f" + str(n_algns1-1) + "r0"
hic_algn2['chimera_index'] = hic_algn2['type'] + "f" + str(n_algns1) + "r" + str(n_algns2)
final_contacts.append([hic_algn1, hic_algn2, algns1, algns2])
hic_algn1['type'] = ('N' if not hic_algn1['is_mapped'] else ('M' if not hic_algn1['is_unique'] else 'U'))
hic_algn2['type'] = ('N' if not hic_algn2['is_mapped'] else ('M' if not hic_algn2['is_unique'] else 'U'))
junction_index = f'{len(algns1)-1}f'
final_contacts.append([hic_algn1, hic_algn2, algns1, algns2, junction_index])
last_reported_alignment_forward = 2
if n_algns2 >= 2:
# "dict" trick to store the type of contact and not modify original entry:
# store the type of contact and do not modify original entry:
hic_algn1 = dict(algns1[-1])
hic_algn2 = dict(algns2[-2])
# Modify pos3 of forward read alignment to correspond to actual observed 5' ends in reverse read:
hic_algn1['pos3'] = algns2[-1]['pos5']
hic_algn1['type'] = ('n' if not hic_algn1['is_mapped'] else ('m' if not hic_algn1['is_unique'] else 'u'))
hic_algn2['type'] = ('n' if not hic_algn2['is_mapped'] else ('m' if not hic_algn2['is_unique'] else 'u'))
hic_algn1['chimera_index'] = hic_algn1['type'] + "f" + str(n_algns1) + "r" + str(n_algns2)
hic_algn2['chimera_index'] = hic_algn2['type'] + "f0r" + str(n_algns2-1)
final_contacts.append([hic_algn1, hic_algn2, algns1, algns2])
hic_algn1['type'] = ('N' if not hic_algn1['is_mapped'] else ('M' if not hic_algn1['is_unique'] else 'U'))
hic_algn2['type'] = ('N' if not hic_algn2['is_mapped'] else ('M' if not hic_algn2['is_unique'] else 'U'))
junction_index = f'{len(algns1)}r'
final_contacts.append([hic_algn1, hic_algn2, algns1, algns2, junction_index])
last_reported_alignment_reverse = 2
# End alignments do not overlap. Terra incognita, no evidence of ligation junction, report regular upper-case pair:
# End alignments do not overlap. No evidence of ligation junction for the pair, report regular pair:
else:
hic_algn1 = dict(algns1[-1]) # "dict" trick to store the type of contact and not modify original entry
hic_algn2 = dict(algns2[-1])
hic_algn1['type'] = ('N' if not hic_algn1['is_mapped'] else ('M' if not hic_algn1['is_unique'] else 'U'))
hic_algn2['type'] = ('N' if not hic_algn2['is_mapped'] else ('M' if not hic_algn2['is_unique'] else 'U'))
hic_algn1['chimera_index'] = hic_algn1['type'] + "f" + str(n_algns1) + "r0"
hic_algn2['chimera_index'] = hic_algn2['type'] + "f0r" + str(n_algns2)
final_contacts.append([hic_algn1, hic_algn2, algns1, algns2])
junction_index = f'{len(algns1)}u'
final_contacts.append([hic_algn1, hic_algn2, algns1, algns2, junction_index])

# If we have an overlap of junctions:
else:
last_reported_alignment_forward = last_reported_alignment_reverse = current_reverse_junction

# Reporting all the sequential alignments
# Report all the sequential alignments
# Report all the sequential chimeric pairs in the forward read up to overlap:
for i in range(0, n_algns1-last_reported_alignment_forward):
hic_algn1 = dict(algns1[i])
hic_algn2 = dict(algns1[i+1])
hic_algn1['type'] = ('n' if not hic_algn1['is_mapped'] else ('m' if not hic_algn1['is_unique'] else 'u'))
hic_algn2['type'] = ('n' if not hic_algn2['is_mapped'] else ('m' if not hic_algn2['is_unique'] else 'u'))

hic_algn1['chimera_index'] = hic_algn1['type'] + "f" + str(i + 1) + "r0"
hic_algn2['chimera_index'] = hic_algn2['type'] + "f" + str(i + 2) + "r0"
final_contacts.append([hic_algn1, hic_algn2, algns1, algns2])
hic_algn1['type'] = ('N' if not hic_algn1['is_mapped'] else ('M' if not hic_algn1['is_unique'] else 'U'))
hic_algn2['type'] = ('N' if not hic_algn2['is_mapped'] else ('M' if not hic_algn2['is_unique'] else 'U'))
junction_index = f'{i + 1}f'
final_contacts.append([hic_algn1, hic_algn2, algns1, algns2, junction_index])

# Report the overlap
for i_overlapping in range(current_reverse_junction-1):
Expand All @@ -548,25 +537,24 @@ def rescue_complex_walk(algns1, algns2, max_molecule_size, allowed_offset=3):
hic_algn1 = dict(algns1[idx_forward])
hic_algn2 = dict(algns1[idx_forward+1])
hic_algn2['pos3'] = algns2[idx_reverse-1]['pos5']
hic_algn1['type'] = ('n' if not hic_algn1['is_mapped'] else ('m' if not hic_algn1['is_unique'] else 'u'))
hic_algn2['type'] = ('n' if not hic_algn2['is_mapped'] else ('m' if not hic_algn2['is_unique'] else 'u'))
hic_algn1['chimera_index'] = hic_algn1['type'] + "f" + str(idx_forward) + "r" + str(idx_reverse)
hic_algn2['chimera_index'] = hic_algn2['type'] + "f" + str(idx_forward+1) + "r" + str(idx_reverse-1)
final_contacts.append([hic_algn1, hic_algn2, algns1, algns2])
hic_algn1['type'] = ('N' if not hic_algn1['is_mapped'] else ('M' if not hic_algn1['is_unique'] else 'U'))
hic_algn2['type'] = ('N' if not hic_algn2['is_mapped'] else ('M' if not hic_algn2['is_unique'] else 'U'))
junction_index = f'{idx_forward + 1}b'
final_contacts.append([hic_algn1, hic_algn2, algns1, algns2, junction_index])

# Report all the sequential chimeric pairs in the reverse read, but not the overlap:
for i in range(0, min(current_reverse_junction, n_algns2 - last_reported_alignment_reverse)):
hic_algn1 = dict(algns2[i])
hic_algn2 = dict(algns2[i + 1])
hic_algn1['type'] = ('n' if not hic_algn1['is_mapped'] else ('m' if not hic_algn1['is_unique'] else 'u'))
hic_algn2['type'] = ('n' if not hic_algn2['is_mapped'] else ('m' if not hic_algn2['is_unique'] else 'u'))
hic_algn1['chimera_index'] = hic_algn1['type'] + "f0r" + str(i + 1)
hic_algn2['chimera_index'] = hic_algn2['type'] + "f0r" + str(i + 2)
final_contacts.append([hic_algn1, hic_algn2, algns1, algns2])
hic_algn1['type'] = ('N' if not hic_algn1['is_mapped'] else ('M' if not hic_algn1['is_unique'] else 'U'))
hic_algn2['type'] = ('N' if not hic_algn2['is_mapped'] else ('M' if not hic_algn2['is_unique'] else 'U'))
junction_index = f'{n_algns1 + min(current_reverse_junction, n_algns2 - last_reported_alignment_reverse) - i - (1 if current_reverse_junction>1 else 0)}r'
final_contacts.append([hic_algn1, hic_algn2, algns1, algns2, junction_index])

final_contacts.sort(key = lambda x: int(x[-1][:-1]) )
return final_contacts

### Additional functions to rescue complex walks ###
### Additional functions for complex walks rescue ###
def ends_do_overlap(algn1, algn2, max_molecule_size=500, allowed_offset=5):
"""
Two ends of alignments overlap if:
Expand Down Expand Up @@ -796,8 +784,8 @@ def write_all_algnments(readID, all_algns1, all_algns2, out_file):


def write_pairsam(
algn1, algn2, readID, sams1, sams2, out_file,
drop_readid, drop_sam, add_columns):
algn1, algn2, readID, junction_index, sams1, sams2, out_file,
drop_readid, drop_sam, add_junction_index, add_columns):
"""
SAM is already tab-separated and
any printable character between ! and ~ may appear in the PHRED field!
Expand Down Expand Up @@ -826,6 +814,9 @@ def write_pairsam(
])
)

if add_junction_index:
cols.append(junction_index)

for col in add_columns:
# use get b/c empty alignments would not have sam tags (NM, AS, etc)
cols.append(str(algn1.get(col, '')))
Expand Down
24 changes: 16 additions & 8 deletions pairtools/pairtools_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@
'algn_read_span',
'dist_to_5',
'dist_to_3',
'seq',
'chimera_index'
'seq'
]

@cli.command()
Expand Down Expand Up @@ -80,6 +79,10 @@
"--drop-sam",
is_flag=True,
help='If specified, do not add sams to the output')
@click.option(
"--add-junction-index",
is_flag=True,
help='If specified, each pair will have junction index in the molecule')
@click.option(
"--add-columns",
type=click.STRING,
Expand Down Expand Up @@ -156,20 +159,20 @@
@common_io_options

def parse(sam_path, chroms_path, output, assembly, min_mapq, max_molecule_size,
drop_readid, drop_seq, drop_sam, add_columns,
drop_readid, drop_seq, drop_sam, add_junction_index, add_columns,
output_parsed_alignments, output_stats, **kwargs):
'''Find ligation junctions in .sam, make .pairs.
SAM_PATH : an input .sam/.bam file with paired-end sequence alignments of
Hi-C molecules. If the path ends with .bam, the input is decompressed from
bam with samtools. By default, the input is read from stdin.
'''
parse_py(sam_path, chroms_path, output, assembly, min_mapq, max_molecule_size,
drop_readid, drop_seq, drop_sam, add_columns,
drop_readid, drop_seq, drop_sam, add_junction_index, add_columns,
output_parsed_alignments, output_stats, **kwargs)


def parse_py(sam_path, chroms_path, output, assembly, min_mapq, max_molecule_size,
drop_readid, drop_seq, drop_sam, add_columns,
drop_readid, drop_seq, drop_sam, add_junction_index, add_columns,
output_parsed_alignments, output_stats, **kwargs):
instream = (_fileio.auto_open(sam_path, mode='r',
nproc=kwargs.get('nproc_in'),
Expand Down Expand Up @@ -217,6 +220,9 @@ def parse_py(sam_path, chroms_path, output, assembly, min_mapq, max_molecule_siz
columns.pop(columns.index('sam1'))
columns.pop(columns.index('sam2'))

if not add_junction_index:
columns.pop(columns.index('junction_index'))

header = _headerops.make_standard_pairsheader(
assembly = assembly,
chromsizes = [(chrom, sam_chromsizes[chrom]) for chrom in chromosomes],
Expand All @@ -230,7 +236,7 @@ def parse_py(sam_path, chroms_path, output, assembly, min_mapq, max_molecule_siz
outstream.writelines((l+'\n' for l in header))

streaming_classify(body_stream, outstream, chromosomes, min_mapq,
max_molecule_size, drop_readid, drop_seq, drop_sam,
max_molecule_size, drop_readid, drop_seq, drop_sam, add_junction_index,
add_columns, out_alignments_stream, out_stat, **kwargs)

# save statistics to a file if it was requested:
Expand All @@ -249,7 +255,7 @@ def parse_py(sam_path, chroms_path, output, assembly, min_mapq, max_molecule_siz


def streaming_classify(instream, outstream, chromosomes, min_mapq, max_molecule_size,
drop_readid, drop_seq, drop_sam, add_columns,
drop_readid, drop_seq, drop_sam, add_junction_index, add_columns,
out_alignments_stream, out_stat, **kwargs):
"""
"""
Expand All @@ -276,7 +282,7 @@ def streaming_classify(instream, outstream, chromosomes, min_mapq, max_molecule_

if not(line) or ((readID != prev_readID) and prev_readID):

for algn1, algn2, all_algns1, all_algns2 in _parse.parse_sams_into_pair(
for algn1, algn2, all_algns1, all_algns2, junction_index in _parse.parse_sams_into_pair(
sams1,
sams2,
min_mapq,
Expand All @@ -298,10 +304,12 @@ def streaming_classify(instream, outstream, chromosomes, min_mapq, max_molecule_
_parse.write_pairsam(
algn1, algn2,
prev_readID,
junction_index,
sams1, sams2,
outstream,
drop_readid,
drop_sam,
add_junction_index,
add_columns)

# add a pair to PairCounter if stats output is requested:
Expand Down
Loading

0 comments on commit 5d0c9cf

Please sign in to comment.