R/tidy_bib_file.R

#' Tidy bibliography file
#'
#' Removes duplicate and unneeded entries from a Bib(La)Tex-file.
#'
#' @param rmd_file Character. One path or a vector of paths to the R Markdown files that use the messy bibliography file.
#' @param messy_bibliography Character. One path or a vector of paths to the messy bibliography file(s).
#' @param file Character. Path and name for the to-be-created tidy bibliography. If \code{NULL} the messy bibliography is replaced.
#' @inheritParams query_bib
#'
#' @export
#'
#' @examples
#' \dontrun{tidy_bib_file(rmd_file = c("introduction.Rmd", "methods.Rmd"), messy_bibliography = "references.bib", file = "tidy_references.bib")}

tidy_bib_file <- function(
  rmd_file
  , messy_bibliography
  , file = NULL
  , encoding = getOption("citr.encoding")
  , betterbiblatex_format = getOption("citr.betterbiblatex_format")
) {
  assert_that(is.character(rmd_file))
  assert_that(is.character(messy_bibliography))
  if(!is.null(file)) {
    assert_that(is.string(file))
  } else {
    file <- messy_bibliography
  }
  assert_that(is.string(encoding))
  assert_that(length(encoding) == 1)
  assert_that(is.string(betterbiblatex_format))
  if(!betterbiblatex_format %in% c("bibtex", "biblatex")) {
    stop("Bibliography format not supported. Use either 'bibtex'  or 'biblatex'.")
  }

  rmd <- c()
  for(i in seq_along(rmd_file)) {
    rmd <- paste(c(rmd, readLines(rmd_file[i], encoding = encoding, warn = FALSE)), collapse = " ")
  }

  if(nchar(rmd) == 0){
    stop("The R Markdown file contains no text.")
  }

 rmd_text <- prep_text(rmd)

  # reference_handles <- unlist(regmatches(rmd_text, gregexpr("@[^;,\\s\\]\\)]+", rmd_text, useBytes = TRUE, perl = TRUE)))
  reference_handles <- unlist(regmatches(rmd_text, gregexpr("@[^\\]\\{\\}\\(\\)'=,\\s(---)((.|;)\\s)]+", rmd_text, useBytes = TRUE, perl = TRUE)))
  # reference_handles <- stringi::stri_extract_all(rmd_text, regex = "@[^;\\s\\],]+")[[1]]
  reference_handles <- gsub("@", "", unique(reference_handles), useBytes = TRUE)

  if(length(reference_handles) == 0) stop("Found no references in ", rmd_file)

  complete_bibliography <- c()
  for(i in seq_along(messy_bibliography)) {
    complete_bibliography <- append(complete_bibliography, RefManageR::ReadBib(messy_bibliography[i], check = FALSE, .Encoding = encoding))
  }

  necessary_bibliography <- complete_bibliography[names(complete_bibliography) %in% reference_handles]

  if(length(necessary_bibliography) == 0) stop("Found none of the ", length(reference_handles), " necessary references in the look-up bibliography.")

  if(length(necessary_bibliography) < length(reference_handles)) warning("Only found ", length(necessary_bibliography), " out of ", length(reference_handles), " necessary references in the look-up bibliography. The following references could not be found:\n  ", paste(reference_handles[!reference_handles %in% names(complete_bibliography)], collapse = ", "))

  message("Removing ", length(complete_bibliography) - length(unique(necessary_bibliography)), " unneeded bibliography entries.")

  bib_options <- RefManageR::BibOptions()
  RefManageR::BibOptions(check.entries = FALSE)
  on.exit(RefManageR::BibOptions(bib_options))

  RefManageR::WriteBib(unique(necessary_bibliography), file = file, useBytes = TRUE, biblatex = betterbiblatex_format == "biblatex")
}


# Adapted from wordcountaddin (https://github.com/benmarwick/wordcountaddin/blob/master/R/hello.R)

prep_text <- function(text){

  # remove all line breaks, http://stackoverflow.com/a/21781150/1036500
  text <- gsub("[\r\n]", " ", text)

  # don't include front yaml
  # text <- gsub("---.+?---", "", text, useBytes = TRUE)

  # don't include text in code chunks: https://regex101.com/#python
  text <- gsub("```\\{.+?\\}.+?```", "", text, useBytes = TRUE)

  # don't include text in in-line R code
  text <- gsub("`r.+?`", "", text, useBytes = TRUE)

  # don't include HTML comments
  # text <- gsub("<!--.+?-->", "", text, useBytes = TRUE)

  # don't include inline markdown URLs
  text <- gsub("\\(http.+?\\)", "", text, useBytes = TRUE)

  # don't include images with captions
  # text <- gsub("!\\[.+?\\)", "", text, useBytes = TRUE)

  # don't include html tags
  text <- gsub("<.+?>|</.+?>", "", text)

  # don't include percent signs because they trip up stringi
  text <- gsub("%", "", text)

  # remove cross-references
  text <- gsub("\\\\@ref\\(.+?\\)", "", text, useBytes = TRUE)

  # remove e-mail addresses (http://emailregex.com/)
  text <- gsub("[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,6}", "", text, useBytes = TRUE)

  text
}