open2c · golobor · Dec 22, 2017 · Dec 15, 2017 · Dec 15, 2017 · Dec 15, 2017
diff --git a/pairsamtools/pairsam_dedup.py b/pairsamtools/pairsam_dedup.py
@@ -11,6 +11,7 @@
 
 from . import _dedup, _fileio, _pairsam_format, _headerops, cli, common_io_options
 from .pairsam_markasdup import mark_split_pair_as_dup
+from .pairsam_stats import PairCounter
 
 
 UTIL_NAME = 'pairsam_dedup'
@@ -172,6 +173,23 @@ def dedup_py(
                                    nproc=kwargs.get('nproc_out'),
                                    command=kwargs.get('cmd_out', None)) 
                  if output else sys.stdout)
+    out_stats_stream = (_fileio.auto_open(output_stats, mode='w', 
+                           nproc=kwargs.get('nproc_out'),
+                           command=kwargs.get('cmd_out', None)) 
+             if output_stats else None)
+    # Previous way of doing dedup-stats ...
+    # if output_stats:
+    #     stat_f = _fileio.auto_open(output_stats, mode='a',
+    #                                nproc=kwargs.get('nproc_out'),
+    #                                command=kwargs.get('cmd_out', None))
+    #     stat_f.write('{}\t{}\n'.format('dedup/n_unmapped', n_unmapped))
+    #     stat_f.write('{}\t{}\n'.format('dedup/n_dups', n_dups))
+    #     stat_f.write('{}\t{}\n'.format('dedup/n_nodups', n_nodups))
+    #     stat_f.close()
+
+    # generate empty PairCounter if stats output is requested:
+    out_stat = PairCounter() if output_stats else None
+
 
     if not output_dups:
         outstream_dups = None
@@ -207,19 +225,14 @@ def dedup_py(
             and (outstream_unmapped != outstream_dups)):
         outstream_unmapped.writelines((l+'\n' for l in header))
 
-    n_unmapped, n_dups, n_nodups = streaming_dedup(
-        method, max_mismatch, sep, 
+    streaming_dedup( method, max_mismatch, sep,
         c1, c2, p1, p2, s1, s2, unmapped_chrom,
-        body_stream, outstream, outstream_dups, outstream_unmapped, mark_dups)
+        body_stream, outstream, outstream_dups,
+        outstream_unmapped, out_stat, mark_dups)
 
-    if output_stats:
-        stat_f = _fileio.auto_open(output_stats, mode='a',
-                                   nproc=kwargs.get('nproc_out'),
-                                   command=kwargs.get('cmd_out', None))
-        stat_f.write('{}\t{}\n'.format('dedup/n_unmapped', n_unmapped))
-        stat_f.write('{}\t{}\n'.format('dedup/n_dups', n_dups))
-        stat_f.write('{}\t{}\n'.format('dedup/n_nodups', n_nodups))
-        stat_f.close()
+    # save statistics to a file if it was requested:
+    if out_stat:
+        out_stat.save(out_stats_stream)
 
     if instream != sys.stdin:
         instream.close()
@@ -234,6 +247,10 @@ def dedup_py(
             and (outstream_unmapped != outstream_dups)):
         outstream_unmapped.close()
 
+    if out_stats_stream:
+        out_stats_stream.close()
+
+
 def fetchadd(key, mydict):
     key = key.strip()
     if key not in mydict:
@@ -250,9 +267,15 @@ def streaming_dedup(
         c1ind, c2ind, p1ind, p2ind, s1ind, s2ind,
         unmapped_chrom,
         instream, outstream, outstream_dups, outstream_unmapped,
-        mark_dups):
+        out_stat, mark_dups):
     maxind = max(c1ind, c2ind, p1ind, p2ind, s1ind, s2ind)
 
+    # collecting stats during dedup requires the pair type column (COL_PTYPE);
+    # I do not see a way around this:
+    if out_stat:
+        ptind = _pairsam_format.COL_PTYPE
+        maxind = max(maxind, ptind)
+
     dd = _dedup.OnlineDuplicateDetector(method, max_mismatch, returnData=False)
 
     c1 = []; c2 = []; p1 = []; p2 = []; s1 = []; s2 = []
@@ -285,13 +308,21 @@ def streaming_dedup(
                 or (cols[c2ind] == unmapped_chrom)):
 
                 if outstream_unmapped:
-                    outstream_unmapped.write(line)  
-                n_unmapped += 1
+                    outstream_unmapped.write(line)
+
+                # add a pair to PairCounter if stats output is requested:
+                if out_stat:
+                    out_stat.add_pair(cols[c1ind],  int(cols[p1ind]),  cols[s1ind],
+                                      cols[c2ind],  int(cols[p2ind]),  cols[s2ind],
+                                      cols[ptind])
+                # # to be removed: old way of doing dedup stats
+                # n_unmapped += 1
 
             else:
                 line_buffer.append(line)
-                if mark_dups:
-                    cols_buffer.append(cols)
+                # always fill cols_buffer (stats need the parsed columns too):
+                # if mark_dups:
+                cols_buffer.append(cols)
 
                 c1.append(fetchadd(cols[c1ind], chromDict))
                 c2.append(fetchadd(cols[c2ind], chromDict))
@@ -310,32 +341,62 @@ def streaming_dedup(
             if not line:
                 res = np.concatenate([res, dd.finish()])
 
-            for i in range(len(res)): 
+            for i in range(len(res)):
+                # not duplicated pair:
                 if not res[i]:
-                    outstream.write(line_buffer[i])  
-                    n_nodups += 1
+                    outstream.write(line_buffer[i])
+                    if out_stat:
+                        out_stat.add_pair(cols_buffer[i][c1ind],
+                                          int(cols_buffer[i][p1ind]),
+                                          cols_buffer[i][s1ind],
+                                          cols_buffer[i][c2ind],
+                                          int(cols_buffer[i][p2ind]),
+                                          cols_buffer[i][s2ind],
+                                          cols_buffer[i][ptind])
+                    # # to be removed: old way of doing dedup stats
+                    # n_nodups += 1
+                # duplicated pair:
                 else:
-                    n_dups += 1
+                    if out_stat:
+                        out_stat.add_pair(cols_buffer[i][c1ind],
+                                          int(cols_buffer[i][p1ind]),
+                                          cols_buffer[i][s1ind],
+                                          cols_buffer[i][c2ind],
+                                          int(cols_buffer[i][p2ind]),
+                                          cols_buffer[i][s2ind],
+                                          'DD' )
+                    # # to be removed: old way of doing dedup stats
+                    # n_dups += 1
                     if outstream_dups:
-                        if mark_dups:
-                            outstream_dups.write(sep.join(
-                                mark_split_pair_as_dup(cols_buffer[i])))
-                        else:
-                            outstream_dups.write(line_buffer[i])
+                        outstream_dups.write(
+                          # DD-marked pair:
+                          sep.join(mark_split_pair_as_dup(cols_buffer[i])) if mark_dups
+                          # pair as is:
+                          else line_buffer[i] )
+                    # # to be removed:
+                    # if outstream_dups:
+                    #     if mark_dups:
+                    #         outstream_dups.write(sep.join(
+                    #             mark_split_pair_as_dup(cols_buffer[i])))
+                    #     else:
+                    #         outstream_dups.write(line_buffer[i])
+
+
 
             c1 = []; c2 = []; p1 = []; p2 = []; s1 = []; s2 = []
             line_buffer = line_buffer[len(res):]
-            if mark_dups:
-                cols_buffer = cols_buffer[len(res):]
+            # always trim cols_buffer in step with line_buffer:
+            # if mark_dups:
+            cols_buffer = cols_buffer[len(res):]
             if not line:
                 if(len(line_buffer) != 0):                
                     raise ValueError(
                         "{} lines left in the buffer, ".format(len(line_buffer))
                         + "should be none;"
                         + "something went terribly wrong")
                 break
-
-    return n_unmapped, n_dups, n_nodups
+    # do not return dup/dedup/unmapped anymore
+    # return n_unmapped, n_dups, n_nodups
 
 
 if __name__ == '__main__':

diff --git a/pairsamtools/pairsam_stats.py b/pairsamtools/pairsam_stats.py
@@ -158,10 +158,17 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25)
         self._stat['total'] = 0
         self._stat['total_unmapped'] = 0
         self._stat['total_single_sided_mapped'] = 0
+        # total_mapped = total_dups + total_nodups
         self._stat['total_mapped'] = 0
+        self._stat['total_dups'] = 0
+        self._stat['total_nodups'] = 0
+        ########################################
+        # the rest of the stats are based on nodups only:
+        ########################################
         self._stat['cis'] = 0
         self._stat['trans'] = 0
         self._stat['pair_types'] = {}
+        # to be removed:
         self._stat['dedup'] = {}
 
         self._stat['cis_1kb+'] = 0
@@ -357,34 +364,40 @@ def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type):
         """
 
         self._stat['total'] += 1
+        # collect pair type stats including DD:
         self._stat['pair_types'][pair_type] = self._stat['pair_types'].get(pair_type,0) + 1
         if chrom1 == '!' and chrom2 == '!':
             self._stat['total_unmapped'] += 1
         elif chrom1 != '!' and chrom2 != '!':
-            self._stat['chrom_freq'][(chrom1, chrom2)] = (
-                self._stat['chrom_freq'].get((chrom1, chrom2), 0) + 1)
             self._stat['total_mapped'] += 1
-
-            if chrom1 == chrom2:
-                self._stat['cis'] += 1
-                dist = np.abs(pos2-pos1)
-                bin_idx = np.searchsorted(self._dist_bins, dist, 'right') - 1
-                self._stat['dist_freq'][strand1+strand2][bin_idx] += 1
-                if dist >= 1000:
-                    self._stat['cis_1kb+'] += 1
-                if dist >= 2000:
-                    self._stat['cis_2kb+'] += 1
-                if dist >= 4000:
-                    self._stat['cis_4kb+'] += 1
-                if dist >= 10000:
-                    self._stat['cis_10kb+'] += 1
-                if dist >= 20000:
-                    self._stat['cis_20kb+'] += 1
-                if dist >= 40000:
-                    self._stat['cis_40kb+'] += 1
-
+            # only pairs mapped on both sides can be duplicates:
+            if pair_type == 'DD':
+                self._stat['total_dups'] += 1
             else:
-                self._stat['trans'] += 1
+                self._stat['total_nodups'] += 1
+                self._stat['chrom_freq'][(chrom1, chrom2)] = (
+                    self._stat['chrom_freq'].get((chrom1, chrom2), 0) + 1)
+
+                if chrom1 == chrom2:
+                    self._stat['cis'] += 1
+                    dist = np.abs(pos2-pos1)
+                    bin_idx = np.searchsorted(self._dist_bins, dist, 'right') - 1
+                    self._stat['dist_freq'][strand1+strand2][bin_idx] += 1
+                    if dist >= 1000:
+                        self._stat['cis_1kb+'] += 1
+                    if dist >= 2000:
+                        self._stat['cis_2kb+'] += 1
+                    if dist >= 4000:
+                        self._stat['cis_4kb+'] += 1
+                    if dist >= 10000:
+                        self._stat['cis_10kb+'] += 1
+                    if dist >= 20000:
+                        self._stat['cis_20kb+'] += 1
+                    if dist >= 40000:
+                        self._stat['cis_40kb+'] += 1
+
+                else:
+                    self._stat['trans'] += 1
         else:
             self._stat['total_single_sided_mapped'] += 1