From 9b62654efa84970a6642d782b8839469b13b45f4 Mon Sep 17 00:00:00 2001 From: KrystynaGrzesiak Date: Mon, 22 Jul 2024 15:04:50 +0200 Subject: [PATCH] updates readme --- README.Rmd | 5 +++- README.md | 71 ++++++++++++++++++++++++++++++------------------------ 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/README.Rmd b/README.Rmd index 3490c05..222f04f 100644 --- a/README.Rmd +++ b/README.Rmd @@ -21,7 +21,10 @@ knitr::opts_chunk$set( ## Overview -The kmerFilters is an R package providing tools for simulating k-mer data and benchmarking various filtering techniques. It is designed to assist researchers and practitioners in evaluating and comparing different approaches for preprocessing sequence data, particularly for applications such as protein function prediction. +k-mers (n-grams) refer to k-length substrings derived from longer sequences, which can be continuous (representing a block of subsequent residues) or discontinuous (where the wildcards representing the gaps between residues are allowed). In biological applications, k-mers are used for various purposes, including genome assembly, sequence alignment, motif discovery, variant calling, phylogenetics, and CRISPR target identification. Due to the vast number of variables introduced by k-mer notation, we need tools to filter the variables. + +In this package we provide tools for simulating k-mer data and benchmarking various filtering techniques. It is designed to assist researchers and practitioners in evaluating and comparing different approaches for preprocessing sequence data, particularly for applications such as protein function prediction. + ## Features diff --git a/README.md b/README.md index dcfc8ed..9ec1d01 100644 --- a/README.md +++ b/README.md @@ -11,10 +11,19 @@ coverage](https://codecov.io/gh/BioGenies/kmerFilters/branch/main/graph/badge.sv ## Overview -The kmerFilters is an R package providing tools for simulating k-mer -data and benchmarking various filtering techniques. It is designed to -assist researchers and practitioners in evaluating and comparing -different approaches for preprocessing sequence data, particularly for +k-mers (n-grams) refer to k-length substrings derived from longer +sequences, which can be continuous (representing a block of subsequent +residues) or discontinuous (where the wildcards representing the gaps +between residues are allowed). In biological applications, k-mers are +used for various purposes, including genome assembly, sequence +alignment, motif discovery, variant calling, phylogenetics, and CRISPR +target identification. Due to the vast number of variables introduced by +k-mer notation, we need tools to filter the variables. + +In this package we provide tools for simulating k-mer data and +benchmarking various filtering techniques. It is designed to assist +researchers and practitioners in evaluating and comparing different +approaches for preprocessing sequence data, particularly for applications such as protein function prediction. ## Features @@ -67,16 +76,16 @@ motifs <- generate_motifs(alphabet = alph, motifs #> [[1]] -#> [1] "d" "_" "c" "_" "_" "_" "d" +#> [1] "d" "_" "_" "_" "_" "_" "b" #> #> [[2]] -#> [1] "d" "_" "_" "_" "_" "d" "_" "a" "_" "d" +#> [1] "c" "_" "_" "d" "c" #> #> [[3]] -#> [1] "a" "_" "_" "d" +#> [1] "b" "_" "c" "_" "_" "c" #> #> [[4]] -#> [1] "c" "_" "d" "_" "_" "a" +#> [1] "c" "_" "_" "b" "_" "_" "c" ``` Using simulated motifs we can simulate positive and negative sequences @@ -89,31 +98,31 @@ results <- generate_kmer_data(n_seq = 20, motifs = motifs, n_injections = 4) results -#> 20 x 14659 sparse Matrix of class "dgCMatrix" +#> 20 x 14940 sparse Matrix of class "dgCMatrix" #> [[ suppressing 33 column names 'a', 'd', 'b' ... ]] #> -#> [1,] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 . . . 1 1 1 1 1 1 1 1 1 1 1 . . ...... -#> [2,] 1 1 1 1 1 1 1 . 1 1 1 1 1 1 . . . 1 1 1 1 1 1 1 1 . 1 1 . 1 . 1 1 ...... -#> [3,] 1 1 1 1 1 1 . 1 . 1 1 1 1 1 1 . . . . 1 1 1 1 1 1 . . 1 . 1 . 1 1 ...... -#> [4,] 1 1 1 1 1 1 1 1 1 1 1 1 . 1 1 . 1 1 . . 1 1 1 1 1 . 1 1 . 1 . 1 1 ...... -#> [5,] 1 1 1 1 1 1 . . 1 1 . 1 1 1 1 1 1 . 1 . 1 1 1 . . 1 . 1 . 1 . . . ...... -#> [6,] 1 1 1 1 1 1 1 1 1 . 1 1 1 1 1 1 1 . . . 1 1 1 1 1 1 . . 1 1 . . 1 ...... -#> [7,] 1 1 1 1 1 1 1 . 1 1 1 1 . 1 1 . . . . . 1 1 1 . 1 . . 1 . 1 . . 1 ...... -#> [8,] 1 1 1 1 1 1 1 . . 1 1 1 . 1 1 . . 1 1 . 1 1 1 . 1 . 1 1 1 1 . . 1 ...... -#> [9,] 1 1 1 1 . 1 1 1 1 1 1 1 1 1 1 1 1 . 1 . 1 1 1 . 1 1 . 1 1 1 . 1 . ...... -#> [10,] 1 1 1 1 1 1 . 1 1 1 . . . 1 1 1 1 . 1 1 1 1 . 1 1 . . 1 . 1 1 . 1 ...... -#> [11,] 1 1 1 1 1 1 . . 1 1 . 1 1 . 1 . 1 . 1 1 1 1 . . 1 1 1 . . 1 . . . ...... -#> [12,] 1 1 1 1 . . . 1 1 1 1 1 1 1 1 1 1 . . . 1 . 1 1 1 1 1 . 1 1 . 1 . ...... -#> [13,] 1 1 1 1 1 1 1 1 . 1 1 1 . 1 . . . 1 1 1 . 1 1 1 . . . 1 1 . . 1 1 ...... -#> [14,] 1 1 1 1 . . . 1 1 1 . 1 1 1 1 1 1 . 1 . 1 1 1 1 1 1 . 1 . . 1 1 1 ...... -#> [15,] 1 1 1 1 1 1 1 . . 1 1 1 . 1 1 1 . . 1 . . 1 . 1 1 . 1 1 . 1 1 1 1 ...... -#> [16,] 1 1 1 1 . . . 1 . 1 1 1 1 1 . 1 1 1 . 1 1 1 1 1 . 1 1 . 1 1 1 1 . ...... -#> [17,] 1 1 1 1 1 . . 1 1 . . 1 1 1 1 . . 1 1 1 . . 1 1 1 . 1 . 1 1 . 1 1 ...... -#> [18,] 1 1 1 1 1 . 1 1 1 1 1 1 1 1 1 . 1 1 1 1 . . . . 1 1 1 1 . 1 1 1 1 ...... -#> [19,] 1 1 1 1 . 1 1 1 . . 1 1 1 1 . 1 1 . 1 1 1 . . . 1 . . 1 1 . 1 1 1 ...... -#> [20,] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 . 1 1 . . 1 1 . . 1 1 1 1 1 . 1 1 1 ...... +#> [1,] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 . . . . . . 1 1 1 1 1 1 1 1 1 1 1 1 . ...... +#> [2,] 1 1 1 1 . 1 1 1 1 1 1 1 1 1 1 1 1 . . . 1 1 . 1 1 1 1 1 . 1 . 1 1 ...... +#> [3,] 1 1 1 1 . 1 . 1 . . 1 1 1 1 1 1 1 1 1 1 1 1 1 1 . 1 1 1 1 . 1 1 1 ...... +#> [4,] 1 1 1 1 . 1 1 1 1 1 . 1 1 . 1 1 1 1 1 1 . 1 1 1 . 1 . 1 1 1 1 1 1 ...... +#> [5,] 1 1 1 1 1 1 1 . 1 1 1 1 1 1 . 1 1 1 . . 1 1 1 . 1 1 . 1 1 1 1 1 . ...... +#> [6,] 1 1 1 1 1 . 1 1 1 1 1 1 . 1 . 1 . 1 1 . 1 1 . 1 . . . 1 1 1 1 . 1 ...... +#> [7,] 1 1 1 1 1 1 . 1 1 . 1 1 1 1 1 1 . 1 1 1 . 1 1 . 1 1 . 1 1 1 1 . . ...... +#> [8,] 1 1 1 1 1 . 1 . 1 1 1 1 . 1 . 1 1 . . . 1 . . 1 1 . 1 1 . 1 1 1 . ...... +#> [9,] 1 1 1 1 1 1 . 1 1 1 1 1 1 1 . 1 . . 1 1 1 . . 1 1 . 1 1 1 1 1 1 1 ...... +#> [10,] 1 1 1 1 1 1 1 1 . 1 1 1 1 . . . 1 1 . . 1 1 1 . 1 1 1 1 . 1 1 1 . ...... +#> [11,] 1 1 1 1 . . . 1 . 1 . 1 1 1 1 1 1 1 1 . 1 1 1 1 1 1 1 1 1 . 1 . . ...... +#> [12,] 1 1 1 1 1 1 1 . . 1 1 1 1 1 1 1 1 . 1 . 1 1 . . 1 1 1 1 . 1 1 1 . ...... +#> [13,] 1 1 1 1 1 1 1 . 1 1 . 1 . 1 1 . 1 1 1 1 1 1 . . . 1 . 1 1 1 1 1 1 ...... +#> [14,] 1 1 1 1 1 . 1 1 1 . 1 1 1 1 1 1 . . 1 1 1 1 1 1 . 1 1 1 1 1 . 1 . ...... +#> [15,] 1 1 . 1 1 1 . 1 . 1 1 . 1 . . . 1 . 1 1 1 . 1 . 1 . 1 . 1 1 . 1 1 ...... +#> [16,] 1 1 1 1 . 1 1 . 1 1 . 1 . 1 . 1 1 1 1 . 1 . 1 . . 1 . 1 . 1 1 . . ...... +#> [17,] 1 1 1 1 1 1 1 1 1 1 1 1 . 1 1 . . 1 1 . 1 1 1 1 1 1 . . 1 1 1 1 1 ...... +#> [18,] 1 1 1 1 1 1 1 1 1 1 . 1 1 . 1 . . 1 1 1 1 1 . 1 . 1 . 1 1 1 . 1 1 ...... +#> [19,] 1 1 1 1 . . 1 1 . 1 . 1 1 1 1 1 1 1 1 1 . 1 . 1 . . . 1 1 1 1 . 1 ...... +#> [20,] 1 1 1 1 1 . 1 1 1 . 1 1 1 1 1 1 . 1 1 1 1 1 . . . 1 1 1 1 1 1 . 1 ...... #> -#> .....suppressing 14626 columns in show(); maybe adjust 'options(max.print= *, width = *)' +#> .....suppressing 14907 columns in show(); maybe adjust 'options(max.print= *, width = *)' #> .............................. ``` @@ -128,7 +137,7 @@ For example, the following code: ``` r get_target_additive(results) -#> [1] 1 1 1 1 1 1 1 0 1 1 0 1 0 0 0 0 1 0 0 0 +#> [1] 0 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 ``` creates a binary response variable based on the logistic regression