updates readme

BioGenies · Jul 22, 2024 · 9b62654 · 9b62654
1 parent 8152ea4
commit 9b62654
Show file tree

Hide file tree

Showing 2 changed files with 44 additions and 32 deletions.
diff --git a/README.Rmd b/README.Rmd
@@ -21,7 +21,10 @@ knitr::opts_chunk$set(
 
 ## Overview 
 
-The kmerFilters is an R package providing tools for simulating k-mer data and benchmarking various filtering techniques. It is designed to assist researchers and practitioners in evaluating and comparing different approaches for preprocessing sequence data, particularly for applications such as protein function prediction.
+k-mers (n-grams) are k-length substrings derived from longer sequences. They can be continuous (a block of consecutive residues) or discontinuous (with wildcards representing gaps between residues). In biological applications, k-mers are used for various purposes, including genome assembly, sequence alignment, motif discovery, variant calling, phylogenetics, and CRISPR target identification. Because k-mer notation produces a vast number of variables, tools for filtering these variables are needed.
+
+In this package we provide tools for simulating k-mer data and benchmarking various filtering techniques. The package is designed to assist researchers and practitioners in evaluating and comparing different approaches for preprocessing sequence data, particularly for applications such as protein function prediction.
+
 
 ## Features
 

diff --git a/README.md b/README.md
@@ -11,10 +11,19 @@ coverage](https://codecov.io/gh/BioGenies/kmerFilters/branch/main/graph/badge.sv
 
 ## Overview
 
-The kmerFilters is an R package providing tools for simulating k-mer
-data and benchmarking various filtering techniques. It is designed to
-assist researchers and practitioners in evaluating and comparing
-different approaches for preprocessing sequence data, particularly for
+k-mers (n-grams) are k-length substrings derived from longer sequences.
+They can be continuous (a block of consecutive residues) or
+discontinuous (with wildcards representing gaps between residues). In
+biological applications, k-mers are used for various purposes,
+including genome assembly, sequence alignment, motif discovery, variant
+calling, phylogenetics, and CRISPR target identification. Because k-mer
+notation produces a vast number of variables, tools for filtering these
+variables are needed.
+
+In this package we provide tools for simulating k-mer data and
+benchmarking various filtering techniques. The package is designed to
+assist researchers and practitioners in evaluating and comparing
+different approaches for preprocessing sequence data, particularly for
 applications such as protein function prediction.
 
 ## Features
@@ -67,16 +76,16 @@ motifs <- generate_motifs(alphabet = alph,
 
 motifs
 #> [[1]]
-#> [1] "d" "_" "c" "_" "_" "_" "d"
+#> [1] "d" "_" "_" "_" "_" "_" "b"
 #> 
 #> [[2]]
-#>  [1] "d" "_" "_" "_" "_" "d" "_" "a" "_" "d"
+#> [1] "c" "_" "_" "d" "c"
 #> 
 #> [[3]]
-#> [1] "a" "_" "_" "d"
+#> [1] "b" "_" "c" "_" "_" "c"
 #> 
 #> [[4]]
-#> [1] "c" "_" "d" "_" "_" "a"
+#> [1] "c" "_" "_" "b" "_" "_" "c"
 ```
 
 Using simulated motifs we can simulate positive and negative sequences
@@ -89,31 +98,31 @@ results <- generate_kmer_data(n_seq = 20,
                               motifs = motifs, 
                               n_injections = 4)
 results
-#> 20 x 14659 sparse Matrix of class "dgCMatrix"
+#> 20 x 14940 sparse Matrix of class "dgCMatrix"
 #>   [[ suppressing 33 column names 'a', 'd', 'b' ... ]]
 #>                                                                               
-#>  [1,] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 . . . 1 1 1 1 1 1 1 1 1 1 1 . . ......
-#>  [2,] 1 1 1 1 1 1 1 . 1 1 1 1 1 1 . . . 1 1 1 1 1 1 1 1 . 1 1 . 1 . 1 1 ......
-#>  [3,] 1 1 1 1 1 1 . 1 . 1 1 1 1 1 1 . . . . 1 1 1 1 1 1 . . 1 . 1 . 1 1 ......
-#>  [4,] 1 1 1 1 1 1 1 1 1 1 1 1 . 1 1 . 1 1 . . 1 1 1 1 1 . 1 1 . 1 . 1 1 ......
-#>  [5,] 1 1 1 1 1 1 . . 1 1 . 1 1 1 1 1 1 . 1 . 1 1 1 . . 1 . 1 . 1 . . . ......
-#>  [6,] 1 1 1 1 1 1 1 1 1 . 1 1 1 1 1 1 1 . . . 1 1 1 1 1 1 . . 1 1 . . 1 ......
-#>  [7,] 1 1 1 1 1 1 1 . 1 1 1 1 . 1 1 . . . . . 1 1 1 . 1 . . 1 . 1 . . 1 ......
-#>  [8,] 1 1 1 1 1 1 1 . . 1 1 1 . 1 1 . . 1 1 . 1 1 1 . 1 . 1 1 1 1 . . 1 ......
-#>  [9,] 1 1 1 1 . 1 1 1 1 1 1 1 1 1 1 1 1 . 1 . 1 1 1 . 1 1 . 1 1 1 . 1 . ......
-#> [10,] 1 1 1 1 1 1 . 1 1 1 . . . 1 1 1 1 . 1 1 1 1 . 1 1 . . 1 . 1 1 . 1 ......
-#> [11,] 1 1 1 1 1 1 . . 1 1 . 1 1 . 1 . 1 . 1 1 1 1 . . 1 1 1 . . 1 . . . ......
-#> [12,] 1 1 1 1 . . . 1 1 1 1 1 1 1 1 1 1 . . . 1 . 1 1 1 1 1 . 1 1 . 1 . ......
-#> [13,] 1 1 1 1 1 1 1 1 . 1 1 1 . 1 . . . 1 1 1 . 1 1 1 . . . 1 1 . . 1 1 ......
-#> [14,] 1 1 1 1 . . . 1 1 1 . 1 1 1 1 1 1 . 1 . 1 1 1 1 1 1 . 1 . . 1 1 1 ......
-#> [15,] 1 1 1 1 1 1 1 . . 1 1 1 . 1 1 1 . . 1 . . 1 . 1 1 . 1 1 . 1 1 1 1 ......
-#> [16,] 1 1 1 1 . . . 1 . 1 1 1 1 1 . 1 1 1 . 1 1 1 1 1 . 1 1 . 1 1 1 1 . ......
-#> [17,] 1 1 1 1 1 . . 1 1 . . 1 1 1 1 . . 1 1 1 . . 1 1 1 . 1 . 1 1 . 1 1 ......
-#> [18,] 1 1 1 1 1 . 1 1 1 1 1 1 1 1 1 . 1 1 1 1 . . . . 1 1 1 1 . 1 1 1 1 ......
-#> [19,] 1 1 1 1 . 1 1 1 . . 1 1 1 1 . 1 1 . 1 1 1 . . . 1 . . 1 1 . 1 1 1 ......
-#> [20,] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 . 1 1 . . 1 1 . . 1 1 1 1 1 . 1 1 1 ......
+#>  [1,] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 . . . . . . 1 1 1 1 1 1 1 1 1 1 1 1 . ......
+#>  [2,] 1 1 1 1 . 1 1 1 1 1 1 1 1 1 1 1 1 . . . 1 1 . 1 1 1 1 1 . 1 . 1 1 ......
+#>  [3,] 1 1 1 1 . 1 . 1 . . 1 1 1 1 1 1 1 1 1 1 1 1 1 1 . 1 1 1 1 . 1 1 1 ......
+#>  [4,] 1 1 1 1 . 1 1 1 1 1 . 1 1 . 1 1 1 1 1 1 . 1 1 1 . 1 . 1 1 1 1 1 1 ......
+#>  [5,] 1 1 1 1 1 1 1 . 1 1 1 1 1 1 . 1 1 1 . . 1 1 1 . 1 1 . 1 1 1 1 1 . ......
+#>  [6,] 1 1 1 1 1 . 1 1 1 1 1 1 . 1 . 1 . 1 1 . 1 1 . 1 . . . 1 1 1 1 . 1 ......
+#>  [7,] 1 1 1 1 1 1 . 1 1 . 1 1 1 1 1 1 . 1 1 1 . 1 1 . 1 1 . 1 1 1 1 . . ......
+#>  [8,] 1 1 1 1 1 . 1 . 1 1 1 1 . 1 . 1 1 . . . 1 . . 1 1 . 1 1 . 1 1 1 . ......
+#>  [9,] 1 1 1 1 1 1 . 1 1 1 1 1 1 1 . 1 . . 1 1 1 . . 1 1 . 1 1 1 1 1 1 1 ......
+#> [10,] 1 1 1 1 1 1 1 1 . 1 1 1 1 . . . 1 1 . . 1 1 1 . 1 1 1 1 . 1 1 1 . ......
+#> [11,] 1 1 1 1 . . . 1 . 1 . 1 1 1 1 1 1 1 1 . 1 1 1 1 1 1 1 1 1 . 1 . . ......
+#> [12,] 1 1 1 1 1 1 1 . . 1 1 1 1 1 1 1 1 . 1 . 1 1 . . 1 1 1 1 . 1 1 1 . ......
+#> [13,] 1 1 1 1 1 1 1 . 1 1 . 1 . 1 1 . 1 1 1 1 1 1 . . . 1 . 1 1 1 1 1 1 ......
+#> [14,] 1 1 1 1 1 . 1 1 1 . 1 1 1 1 1 1 . . 1 1 1 1 1 1 . 1 1 1 1 1 . 1 . ......
+#> [15,] 1 1 . 1 1 1 . 1 . 1 1 . 1 . . . 1 . 1 1 1 . 1 . 1 . 1 . 1 1 . 1 1 ......
+#> [16,] 1 1 1 1 . 1 1 . 1 1 . 1 . 1 . 1 1 1 1 . 1 . 1 . . 1 . 1 . 1 1 . . ......
+#> [17,] 1 1 1 1 1 1 1 1 1 1 1 1 . 1 1 . . 1 1 . 1 1 1 1 1 1 . . 1 1 1 1 1 ......
+#> [18,] 1 1 1 1 1 1 1 1 1 1 . 1 1 . 1 . . 1 1 1 1 1 . 1 . 1 . 1 1 1 . 1 1 ......
+#> [19,] 1 1 1 1 . . 1 1 . 1 . 1 1 1 1 1 1 1 1 1 . 1 . 1 . . . 1 1 1 1 . 1 ......
+#> [20,] 1 1 1 1 1 . 1 1 1 . 1 1 1 1 1 1 . 1 1 1 1 1 . . . 1 1 1 1 1 1 . 1 ......
 #> 
-#>  .....suppressing 14626 columns in show(); maybe adjust 'options(max.print= *, width = *)'
+#>  .....suppressing 14907 columns in show(); maybe adjust 'options(max.print= *, width = *)'
 #>  ..............................
 ```
 
@@ -128,7 +137,7 @@ For example, the following code:
 
 ``` r
 get_target_additive(results)
-#>  [1] 1 1 1 1 1 1 1 0 1 1 0 1 0 0 0 0 1 0 0 0
+#>  [1] 0 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
 ```
 
 creates a binary response variable based on the logistic regression