# cancerRehab_rCode_all.txt
# 99 lines (97 loc), 4.45 KB
# ---- Parse the PubMed XML export and count publications per year ----
# Relative file names below resolve against this project directory.
setwd("H:/Bibliometrics/Clinical center/Cancer rehab")
# extract_xml() is not defined in this file; presumably it comes from here.
source("H:/Bibliometrics/R/Functions/pubmedXML.R")

theData <- extract_xml("cancer_rehab.xml")
write.csv(theData, "cancerRehab_parsed.csv", row.names = FALSE)

# Group the records by year; each group's row count is that year's total.
recordsByYear <- split(theData, theData$year)
years <- sapply(recordsByYear, nrow)
write.csv(years, "pubsPerYear.csv")
# ---- Frequency tables for the "|"-delimited fields ----
# Each field packs several values per record, separated by a literal "|".
# The values are split apart, pooled across records, tallied with
# plyr::count(), and sorted so the most frequent values come last.
library(plyr)

# Publication types (this file is written with the default row names).
ptypeValues <- unlist(strsplit(theData$ptype, "|", fixed = TRUE))
ptypes <- count(ptypeValues)
ptypes <- ptypes[order(ptypes$freq), ]
write.csv(ptypes, "pubsPerPubType.csv")

# MeSH headings.
meshValues <- unlist(strsplit(theData$meshHeadings, "|", fixed = TRUE))
mesh <- count(meshValues)
mesh <- mesh[order(mesh$freq), ]
write.csv(mesh, "pubsPerMeshHeading.csv", row.names = FALSE)

# Grant agencies.
funderValues <- unlist(strsplit(theData$grantAgency, "|", fixed = TRUE))
funders <- count(funderValues)
funders <- funders[order(funders$freq), ]
write.csv(funders, "pubsPerFunder.csv", row.names = FALSE)
# Show the 20 most frequent agencies (ascending sort, so they are at the end).
tail(funders, 20)
# ---- MeSH heading counts per publication year ----
# Produces one row per MeSH heading and one column per year, where each cell
# is the number of records carrying that heading in that year.
library(splitstackshape)

# Select by name rather than position: these are the two columns that the
# split and the cast formula below refer to.
meshYrs <- theData[, c("year", "meshHeadings")]

# Expand to one row per (record, heading) pair.
meshYrs <- cSplit(meshYrs, "meshHeadings", "|", direction = "long")

# cSplit() returns a data.table. Convert to a plain data.frame so that the
# positional indexing below (e.g. meshYrs[, 26]) yields vectors.
meshYrs <- as.data.frame(meshYrs)

# Call reshape2's dcast explicitly: reshape2 is not attached at this point in
# the script, so a bare dcast() would depend on what else is loaded. The
# aggregate is spelled out instead of relying on the default-to-length
# fallback; with length() the choice of value.var does not affect the counts.
meshYrs <- reshape2::dcast(
  meshYrs,
  meshHeadings ~ year,
  fun.aggregate = length,
  value.var = "year"
)

# NOTE(review): the positions below are hard-coded. Column 1 is the heading;
# columns 2:29 are assumed to be the year columns (28 years) -- confirm
# against the data before reusing this with a different export.
meshYrs$total <- rowSums(meshYrs[, 2:29])
# Relative change between the counts in column 3 and column 26. Headings with
# a zero count in column 3 yield Inf or NaN.
meshYrs$percChange <- (meshYrs[, 26] - meshYrs[, 3]) / meshYrs[, 3]
write.csv(meshYrs, file = "pubsPerMeshPerYr.csv", row.names = FALSE)
# ---- Publications per journal ----
# Row count of each per-journal group, sorted ascending.
journals <- sort(sapply(split(theData, theData$journal), nrow))
write.csv(journals, "pubsPerJournal.csv")
# The ten journals with the most records.
tail(journals, 10)

# ---- Publications per grant country ----
# grantCountry is "|"-delimited like the other multi-valued fields.
countryValues <- unlist(strsplit(theData$grantCountry, "|", fixed = TRUE))
fcntry <- count(countryValues)
fcntry <- fcntry[order(fcntry$freq), ]
fcntry
write.csv(fcntry, "pubsPerGrantCountry.csv", row.names = FALSE)
# ---- Heat map of the table in cancerByTherapy.csv ----
setwd("H:/Bibliometrics/Clinical center/Cancer rehab")
library(RColorBrewer)

theDF <- read.csv("cancerByTherapy.csv")
# Label the rows with the Term column so the labels carry into the matrix.
rownames(theDF) <- theDF$Term
# Columns 2 through 13 hold the values that are plotted.
theMtx <- as.matrix(theDF[, 2:13])

# 11-step red-yellow-blue diverging palette.
heatColours <- brewer.pal(11, "RdYlBu")
heatmap1 <- heatmap(
  theMtx,
  Rowv = NA,       # no row dendrogram or reordering
  Colv = NA,       # no column dendrogram or reordering
  col = heatColours,
  scale = "none"   # plot the raw values, no row/column scaling
)
# ---- Build a document-term matrix from the abstracts ----
# Reads the parsed records back in, cleans the abstract text, and produces a
# document-term matrix (rows named by PMID) ready for topic modelling.
setwd("C:/Users/IAB/Documents/Rehab")
library(tm)
library(slam)

theData <- read.csv("cancerRehab_parsed.csv", stringsAsFactors = FALSE)
# Project-specific stop words; the first line of the file is skipped.
myStopwords <- scan("stopwords.txt", what = "varchar", skip = 1)

abstracts <- theData$abstract
pmids <- as.vector(theData$pmid)

# VectorSource treats each element as one document. DataframeSource on a
# one-column data frame is rejected by tm >= 0.7-2, which requires columns
# named doc_id and text.
corpDocs <- Corpus(VectorSource(abstracts))

# Text clean-up, in this order: punctuation, case, stop words, stemming,
# whitespace.
corpDocs <- tm_map(corpDocs, removePunctuation)
corpDocs <- tm_map(corpDocs, content_transformer(tolower))
# Lower-case "english": tm resolves this to a file name, so the capitalised
# form only works on case-insensitive file systems.
corpDocs <- tm_map(corpDocs, removeWords, stopwords("english"))
corpDocs <- tm_map(corpDocs, removeWords, myStopwords)
corpDocs <- tm_map(corpDocs, stemDocument)
corpDocs <- tm_map(corpDocs, stripWhitespace)

dtm <- DocumentTermMatrix(corpDocs)
# Guard the relabelling below: one matrix row per PMID, in the same order.
stopifnot(nrow(dtm) == length(pmids))
rownames(dtm) <- pmids

# Drop documents that have no terms left after cleaning.
dtm <- dtm[row_sums(dtm) > 0, ]
dtm
# Remove terms whose sparsity exceeds 99.5%.
dtm <- removeSparseTerms(dtm, 0.995)
# Removing terms can empty a document that only contained rare terms, and
# LDA() refuses matrices with all-zero rows, so filter once more.
dtm <- dtm[row_sums(dtm) > 0, ]
dtm
# ---- Fit the topic model and export its results ----
library(topicmodels)

# One seed per random start (nstart = 5); best = TRUE keeps only the best run.
seed <- list(1379, 6513, 10719, 16007, 20991)
gibbsControl <- list(
  nstart = 5,
  seed = seed,
  best = TRUE,
  burnin = 4000,
  iter = 2000,
  thin = 500
)
# 50-topic LDA fitted by Gibbs sampling.
ldaOut <- LDA(dtm, 50, method = "Gibbs", control = gibbsControl)

# Most likely topic for each document.
primaryTopics <- as.matrix(topics(ldaOut))
write.csv(primaryTopics, "docsToTopics.csv")

# Top 15 terms for each topic.
topicTerms <- as.matrix(terms(ldaOut, 15))
write.csv(topicTerms, "topicsToTerms.csv")

# Per-document topic probabilities, taken from the model's gamma slot.
topicProbs <- as.data.frame(ldaOut@gamma)
write.csv(topicProbs, "topicProbs.csv")

# Every topic passing the 0.08 threshold for a document, joined with "|".
topicLists <- topics(ldaOut, threshold = 0.08)
topicLists <- sapply(topicLists, function(ids) paste0(ids, collapse = "|"))

# Attach both topic assignments to the records. The join key is pmid on the
# record side and the row names on the topic side; all.x = TRUE keeps records
# that have no topic assignment.
newData <- merge(
  theData, primaryTopics,
  by.x = "pmid", by.y = "row.names", all.x = TRUE
)
newData <- merge(
  newData, topicLists,
  by.x = "pmid", by.y = "row.names", all.x = TRUE
)
write.csv(newData, "clusteredData.csv", row.names = FALSE)
# ---- Topic co-occurrence network ----
# Each document contributes one undirected edge joining its two most likely
# topics. Each vertex is a topic, annotated with the number of documents for
# which that topic ranks first.
library(igraph)

# topics(ldaOut, 2) has one column per document; transpose so that each row
# is a document and the two columns are its first and second topic.
edges <- as.matrix(topics(ldaOut, 2))
edges <- as.data.frame(t(edges))

# The vertex list must name every topic that occurs at either end of an edge:
# graph_from_data_frame() stops with an error when the edge list mentions a
# vertex that is missing from `vertices`. Counting only first-choice topics
# would leave out any topic that is only ever a second choice, so enumerate
# the topics from both columns and give such topics a count of zero.
topicIds <- sort(unique(c(edges[[1]], edges[[2]])))
primaryTally <- table(factor(edges[[1]], levels = topicIds))
nCounts <- setNames(as.integer(primaryTally), names(primaryTally))
nodes <- data.frame(names(nCounts), nCounts)

theGraph <- graph_from_data_frame(edges, directed = FALSE, vertices = nodes)
write_graph(theGraph, file = "topicNetwork.graphml", format = "graphml")
write.csv(nodes, file = "nodeList.csv")
# ---- Publications per primary topic per year ----
library(plyr)
library(reshape2)

# Tally the records for every (primary topic, year) combination, then spread
# the years out into columns; combinations that never occur become NA.
topicYearFreq <- count(newData, vars = c("V1", "year"))
tyears <- dcast(topicYearFreq, V1 ~ year, value.var = "freq")
head(tyears)

# Columns 2:29 are summed as the per-topic total, skipping the NA cells.
yearCounts <- tyears[, 2:29]
tyears$total <- rowSums(yearCounts, na.rm = TRUE)

# Relative change between the counts in column 3 and column 27.
baseline <- tyears[, 3]
tyears$percChng <- (tyears[, 27] - baseline) / baseline
write.csv(tyears, "pubsPerTopicPerYear.csv", row.names = FALSE)
# ---- Publications per primary topic per funder ----
library(splitstackshape)

# Columns 12 and 16 of newData are used here; the steps below refer to them
# by the names grantAgency and V1.
funderTopicCols <- c(12, 16)
tfund <- newData[, funderTopicCols]

# One row per (record, agency) pair, then one column per value of V1.
tfund <- cSplit(tfund, "grantAgency", "|", direction = "long")
tfund <- dcast(tfund, grantAgency ~ V1)
head(tfund)

# Per-agency total over columns 2:52, sorted so the largest totals come last.
topicCounts <- tfund[, 2:52]
tfund$total <- rowSums(topicCounts)
tfund <- tfund[order(tfund$total), ]
tail(tfund, 15)
write.csv(tfund, "pubsPerTopicPerFunder.csv", row.names = FALSE)