-
Notifications
You must be signed in to change notification settings - Fork 46
/
tidy_bib_file.R
113 lines (85 loc) · 4.48 KB
/
tidy_bib_file.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#' Tidy bibliography file
#'
#' Removes duplicate and unneeded entries from a Bib(La)Tex-file.
#'
#' @param rmd_file Character. One path or a vector of paths to the R Markdown files that use the messy bibliography file.
#' @param messy_bibliography Character. One path or a vector of paths to the messy bibliography file(s).
#' @param file Character. Path and name for the to-be-created tidy bibliography. If \code{NULL} the messy bibliography is replaced.
#' @inheritParams query_bib
#'
#' @export
#'
#' @examples
#' \dontrun{tidy_bib_file(rmd_file = c("introduction.Rmd", "methods.Rmd"), messy_bibliography = "references.bib", file = "tidy_references.bib")}
tidy_bib_file <- function(
rmd_file
, messy_bibliography
, file = NULL
, encoding = getOption("citr.encoding")
, betterbiblatex_format = getOption("citr.betterbiblatex_format")
) {
assert_that(is.character(rmd_file))
assert_that(is.character(messy_bibliography))
if(!is.null(file)) {
assert_that(is.string(file))
} else {
file <- messy_bibliography
}
assert_that(is.string(encoding))
assert_that(length(encoding) == 1)
assert_that(is.string(betterbiblatex_format))
if(!betterbiblatex_format %in% c("bibtex", "biblatex")) {
stop("Bibliography format not supported. Use either 'bibtex' or 'biblatex'.")
}
rmd <- c()
for(i in seq_along(rmd_file)) {
rmd <- paste(c(rmd, readLines(rmd_file[i], encoding = encoding, warn = FALSE)), collapse = " ")
}
if(nchar(rmd) == 0){
stop("The R Markdown file contains no text.")
}
rmd_text <- prep_text(rmd)
# reference_handles <- unlist(regmatches(rmd_text, gregexpr("@[^;,\\s\\]\\)]+", rmd_text, useBytes = TRUE, perl = TRUE)))
reference_handles <- unlist(regmatches(rmd_text, gregexpr("@[^\\]\\{\\}\\(\\)'=,\\s(---)((.|;)\\s)]+", rmd_text, useBytes = TRUE, perl = TRUE)))
# reference_handles <- stringi::stri_extract_all(rmd_text, regex = "@[^;\\s\\],]+")[[1]]
reference_handles <- gsub("@", "", unique(reference_handles), useBytes = TRUE)
if(length(reference_handles) == 0) stop("Found no references in ", rmd_file)
complete_bibliography <- c()
for(i in seq_along(messy_bibliography)) {
complete_bibliography <- append(complete_bibliography, RefManageR::ReadBib(messy_bibliography[i], check = FALSE, .Encoding = encoding))
}
necessary_bibliography <- complete_bibliography[names(complete_bibliography) %in% reference_handles]
if(length(necessary_bibliography) == 0) stop("Found none of the ", length(reference_handles), " necessary references in the look-up bibliography.")
if(length(necessary_bibliography) < length(reference_handles)) warning("Only found ", length(necessary_bibliography), " out of ", length(reference_handles), " necessary references in the look-up bibliography. The following references could not be found:\n ", paste(reference_handles[!reference_handles %in% names(complete_bibliography)], collapse = ", "))
message("Removing ", length(complete_bibliography) - length(unique(necessary_bibliography)), " unneeded bibliography entries.")
bib_options <- RefManageR::BibOptions()
RefManageR::BibOptions(check.entries = FALSE)
on.exit(RefManageR::BibOptions(bib_options))
RefManageR::WriteBib(unique(necessary_bibliography), file = file, useBytes = TRUE, biblatex = betterbiblatex_format == "biblatex")
}
# Adapted from wordcountaddin (https://github.com/benmarwick/wordcountaddin/blob/master/R/hello.R)
prep_text <- function(text){
# remove all line breaks, http://stackoverflow.com/a/21781150/1036500
text <- gsub("[\r\n]", " ", text)
# don't include front yaml
# text <- gsub("---.+?---", "", text, useBytes = TRUE)
# don't include text in code chunks: https://regex101.com/#python
text <- gsub("```\\{.+?\\}.+?```", "", text, useBytes = TRUE)
# don't include text in in-line R code
text <- gsub("`r.+?`", "", text, useBytes = TRUE)
# don't include HTML comments
# text <- gsub("<!--.+?-->", "", text, useBytes = TRUE)
# don't include inline markdown URLs
text <- gsub("\\(http.+?\\)", "", text, useBytes = TRUE)
# don't include images with captions
# text <- gsub("!\\[.+?\\)", "", text, useBytes = TRUE)
# don't include html tags
text <- gsub("<.+?>|</.+?>", "", text)
# don't include percent signs because they trip up stringi
text <- gsub("%", "", text)
# remove cross-references
text <- gsub("\\\\@ref\\(.+?\\)", "", text, useBytes = TRUE)
# remove e-mail addresses (http://emailregex.com/)
text <- gsub("[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,6}", "", text, useBytes = TRUE)
text
}