diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 44daa85e..7191d35d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -129,7 +129,7 @@ The current build status is as follows: * Code must be reviewed by one other project member and, if needed, be properly adapted/fixed. * We add the `Reviewed-by` tag only for the merge commit. -There will be another checklist for you when you open an actual pull request provided by [the corresponding template](.github/PULL_REQUEST_TEMPLATE/pull-request.md). +There will be another checklist for you when you open an actual pull request provided by [the corresponding template](.github/PULL_REQUEST_TEMPLATE.md). ## Style Conventions diff --git a/NEWS.md b/NEWS.md index b1bfc74b..5ae28b67 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,12 +6,18 @@ - Add a new file `util-tensor.R` containing the class `FourthOrderTensor` to create (author x relation x author x relation) tensors from a list of networks (with each network having a different relation) and its corresponding utility function `get.author.networks.for.multiple.relations` (PR #173, c136b1f6127d73c25f08ae2f317246747aa9ea2b, e4ee0dc926b22ff75d5fd801c1f131bcff4c22eb, 051a5f0287022f97e2367ed0e9591b9df9dbdb3d) - Add function `calculate.EDCPTD.centrality` for calculating the EDCPTD centrality for a fourth-order tensor in the above described form (c136b1f6127d73c25f08ae2f317246747aa9ea2b, e4ee0dc926b22ff75d5fd801c1f131bcff4c22eb, 051a5f0287022f97e2367ed0e9591b9df9dbdb3d) - Add new file `util-networks-misc.R` which contains miscellaneous functions for processing network data and creating and converting various kinds of adjacency matrices: `get.author.names.from.networks`, `get.author.names.from.data`, `get.expanded.adjacency`, `get.expanded.adjacency.matrices`, `get.expanded.adjacency.matrices.cumulated`, `convert.adjacency.matrix.list.to.array` (051a5f0287022f97e2367ed0e9591b9df9dbdb3d) +- Add tests for sliding-window functionality and make parameterized tests possible 
(a3ad0a81015c7f23bce958d5c1922e3b82b28bda, 2ed84ac55d434f62341297b1aa9676c12e383491, PR #184) ### Changed/Improved - Adjust the function `get.authors.by.data.source`: Rename its single parameter to `data.sources` and change the function so that it can extract the authors for multiple data sources at once. The default value of the parameter is a vector containing all the available data sources (commits, mails, issues) (051a5f0287022f97e2367ed0e9591b9df9dbdb3d) - Adjust recommended R version to 3.6.3 in README (92be262514277acb774ab2885c1c0d1c10f03373) - Add R version 4.0 to test suite and adjust package installation in `install.R` to improve compatibility with Travis CI (40aa0d80e2a94434a8be75925dbefbde6d3518b2, 1ba036758a63767e2fcef525c98f5a4fd6938c39, #161) +### Fixed +- Fix sliding-window creation in various splitting functions (`split.network.time.based`, `split.networks.time.based`, `split.data.time.based`, `split.data.activity.based`, `split.network.activity.based`) and also fix the computation of overlapping ranges in the function `construct.overlapping.ranges` to make sure that the last and the second-last range do not cover the same range (1abc1b8dbfc65ccad0cbbc8e33b209e39d2f8118, c34c42aef32a30b82adc53384fd6a1b09fc75dee, 097cebcc477b1b65056d512124575f5a78229c3e, 9a1b6516f490b72b821be2d5365d98cac1907b2f, 0fc179e2735bec37d26a68c6c351ab43770007d2, cad28bf221f942eb25e997aaa2de553181956680, 7602af2cf46f699b2285d53819dec614c71754c6, PR #184) +- Fix off-by-1 error in the function `get.data.cut.to.same.date` (f0744c0e14543292cccb1aa9a61f822755ee7183) +- Fix missing or wrongly set layout when plotting networks (#186, 720cc7ba7bdb635129c7669911aef8e7c6200a6b, 877931b94f87ca097c2f8f3c55e4b4bcc6087742) + ## 3.6 diff --git a/README.md b/README.md index d09d62fe..70e04636 100644 --- a/README.md +++ b/README.md @@ -124,6 +124,7 @@ Alternatively, you can run `Rscript install.R` to install the packages. 
- `logging`: Logging - `sqldf`: For advanced aggregation of `data.frame` objects - `testthat`: For the test suite +- `patrick`: For the test suite - `ggplot2`: For plotting of data - `ggraph`: For plotting of networks (needs `udunits2` system library, e.g., `libudunits2-dev` on Ubuntu!) - `markovchain`: For core/peripheral transition probabilities diff --git a/install.R b/install.R index 132879d9..d796d8d0 100644 --- a/install.R +++ b/install.R @@ -33,6 +33,7 @@ packages = c( "logging", "sqldf", "testthat", + "patrick", "ggplot2", "ggraph", "markovchain", diff --git a/tests.R b/tests.R index 58724171..89981076 100644 --- a/tests.R +++ b/tests.R @@ -12,6 +12,7 @@ ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## ## Copyright 2017, 2019 by Claus Hunsen +## Copyright 2020 by Thomas Bock ## All Rights Reserved. ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / @@ -42,8 +43,9 @@ sessionInfo() logging::loginfo("Running test suite.") -## load package 'testthat' +## load packages 'testthat' and 'patrick' requireNamespace("testthat") +requireNamespace("patrick") ## starting tests do.tests = function(dir) { diff --git a/tests/test-data-cut.R b/tests/test-data-cut.R index 6d85d18b..3234b786 100644 --- a/tests/test-data-cut.R +++ b/tests/test-data-cut.R @@ -16,6 +16,7 @@ ## Copyright 2018 by Claus Hunsen ## Copyright 2018 by Barbara Eckl ## Copyright 2018 by Thomas Bock +## Copyright 2020 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. 
@@ -62,14 +63,14 @@ test_that("Cut commit and mail data to same date range.", { artifact.type = c("Feature", "Feature"), artifact.diff.size = as.integer(c(1, 1))) - mail.data.expected = data.frame(author.name = c("Thomas"), - author.email = c("thomas@example.org"), - message.id = c("<65a1sf31sagd684dfv31@mail.gmail.com>"), - date = get.date.from.string("2016-07-12 16:04:40"), - date.offset = as.integer(c(100)), - subject = c("Re: Fw: busybox 2 tab"), - thread = sprintf("", c(9)), - artifact.type = "Mail") + mail.data.expected = data.frame(author.name = c("Thomas", "Olaf"), + author.email = c("thomas@example.org", "olaf@example.org"), + message.id = c("<65a1sf31sagd684dfv31@mail.gmail.com>", "<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>"), + date = get.date.from.string(c("2016-07-12 16:04:40", "2016-07-12 16:05:37")), + date.offset = as.integer(c(100, 200)), + subject = c("Re: Fw: busybox 2 tab", "Re: Fw: busybox 10"), + thread = sprintf("", c(9, 9)), + artifact.type = c("Mail", "Mail")) commit.data = x.data$get.data.cut.to.same.date(data.sources = data.sources)$get.commits() rownames(commit.data) = 1:nrow(commit.data) diff --git a/tests/test-networks-cut.R b/tests/test-networks-cut.R index 12716572..95f7a891 100644 --- a/tests/test-networks-cut.R +++ b/tests/test-networks-cut.R @@ -14,6 +14,7 @@ ## Copyright 2017 by Christian Hechtl ## Copyright 2018 by Claus Hunsen ## Copyright 2018 by Thomas Bock +## Copyright 2020 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## All Rights Reserved. 
@@ -62,14 +63,14 @@ test_that("Cut commit and mail data to same date range.", { artifact.type = c("Feature", "Feature"), artifact.diff.size = as.integer(c(1, 1))) - mail.data.expected = data.frame(author.name = c("Thomas"), - author.email = c("thomas@example.org"), - message.id = c("<65a1sf31sagd684dfv31@mail.gmail.com>"), - date = get.date.from.string(c("2016-07-12 16:04:40")), - date.offset = as.integer(c(100)), - subject = c("Re: Fw: busybox 2 tab"), - thread = sprintf("", c(9)), - artifact.type = "Mail") + mail.data.expected = data.frame(author.name = c("Thomas", "Olaf"), + author.email = c("thomas@example.org", "olaf@example.org"), + message.id = c("<65a1sf31sagd684dfv31@mail.gmail.com>", "<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>"), + date = get.date.from.string(c("2016-07-12 16:04:40", "2016-07-12 16:05:37")), + date.offset = as.integer(c(100, 200)), + subject = c("Re: Fw: busybox 2 tab", "Re: Fw: busybox 10"), + thread = sprintf("", c(9, 9)), + artifact.type = c("Mail", "Mail")) commit.data = x$get.project.data()$get.commits() rownames(commit.data) = 1:nrow(commit.data) diff --git a/tests/test-networks-equal-constructions.R b/tests/test-networks-equal-constructions.R index fc3e0a09..9f254b9a 100644 --- a/tests/test-networks-equal-constructions.R +++ b/tests/test-networks-equal-constructions.R @@ -13,6 +13,7 @@ ## ## Copyright 2018 by Christian Hechtl ## Copyright 2018 by Claus Hunsen +## Copyright 2020 by Thomas Bock ## All Rights Reserved. 
@@ -86,7 +87,8 @@ compare.edge.and.vertex.lists = function(split.author.networks.one = NULL, split } } -test_that("Compare the bipartite and author network constructed in two ways with author/artifact relation 'cochange'", { +patrick::with_parameters_test_that("Compare the bipartite and author network constructed in two ways + with author/artifact relation 'cochange', ", { ## configuration object for the datapath proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -106,7 +108,7 @@ test_that("Compare the bipartite and author network constructed in two ways with ## split the networks split.networks = split.networks.time.based(networks = list(author.network, bipartite.network), - time.period = splitting.period, sliding.window = FALSE) + time.period = splitting.period, sliding.window = test.sliding.window) ## separate the author and bipartite networks split.author.networks.one = split.networks[[1]] @@ -116,7 +118,8 @@ test_that("Compare the bipartite and author network constructed in two ways with multi.network = network.builder$get.multi.network() ## split the network - multi.network.split = split.network.time.based(network = multi.network, time.period = splitting.period) + multi.network.split = split.network.time.based(network = multi.network, time.period = splitting.period, + sliding.window = test.sliding.window) split.author.networks.two = list() split.bipartite.networks.two = list() @@ -134,10 +137,13 @@ test_that("Compare the bipartite and author network constructed in two ways with ## created with different approaches compare.edge.and.vertex.lists(split.author.networks.one, split.author.networks.two, split.bipartite.networks.one, split.bipartite.networks.two) -}) +}, patrick::cases( + "sliding window: FALSE" = list(test.sliding.window = FALSE), + "sliding window: TRUE" = list(test.sliding.window = TRUE) +)) -test_that("Compare the bipartite and author network constructed in two ways with author relation 'mail' and artifact relation 
- 'cochange'", { +patrick::with_parameters_test_that("Compare the bipartite and author network constructed in two ways + with author relation 'mail' and artifact relation 'cochange', ", { ## configuration object for the datapath proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -158,7 +164,7 @@ test_that("Compare the bipartite and author network constructed in two ways with ## split the networks split.networks = split.networks.time.based(networks = list(author.network, bipartite.network), - time.period = splitting.period, sliding.window = FALSE) + time.period = splitting.period, sliding.window = test.sliding.window) ## separate the author and bipartite networks split.author.networks.one = split.networks[[1]] @@ -168,7 +174,8 @@ test_that("Compare the bipartite and author network constructed in two ways with multi.network = network.builder$get.multi.network() ## split the network - multi.network.split = split.network.time.based(network = multi.network, time.period = splitting.period) + multi.network.split = split.network.time.based(network = multi.network, time.period = splitting.period, + sliding.window = test.sliding.window) split.author.networks.two = list() split.bipartite.networks.two = list() @@ -187,9 +194,13 @@ test_that("Compare the bipartite and author network constructed in two ways with ## created with different approaches compare.edge.and.vertex.lists(split.author.networks.one, split.author.networks.two, split.bipartite.networks.one, split.bipartite.networks.two) -}) +}, patrick::cases( + "sliding window: FALSE" = list(test.sliding.window = FALSE), + "sliding window: TRUE" = list(test.sliding.window = TRUE) +)) -test_that("Compare the bipartite and author network constructed in two ways with author and artifact relation 'mail'", { +patrick::with_parameters_test_that("Compare the bipartite and author network constructed in two ways + with author and artifact relation 'mail', ", { ## configuration object for the datapath 
proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -210,7 +221,7 @@ test_that("Compare the bipartite and author network constructed in two ways with ## split the networks split.networks = split.networks.time.based(networks = list(author.network, bipartite.network), - time.period = splitting.period, sliding.window = FALSE) + time.period = splitting.period, sliding.window = test.sliding.window) ## separate the author and bipartite networks split.author.networks.one = split.networks[[1]] @@ -220,7 +231,8 @@ test_that("Compare the bipartite and author network constructed in two ways with multi.network = network.builder$get.multi.network() ## split the network - multi.network.split = split.network.time.based(network = multi.network, time.period = splitting.period) + multi.network.split = split.network.time.based(network = multi.network, time.period = splitting.period, + sliding.window = test.sliding.window) split.author.networks.two = list() split.bipartite.networks.two = list() @@ -239,4 +251,7 @@ test_that("Compare the bipartite and author network constructed in two ways with ## created with different approaches compare.edge.and.vertex.lists(split.author.networks.one, split.author.networks.two, split.bipartite.networks.one, split.bipartite.networks.two) -}) +}, patrick::cases( + "sliding window: FALSE" = list(test.sliding.window = FALSE), + "sliding window: TRUE" = list(test.sliding.window = TRUE) +)) diff --git a/tests/test-split-sliding-window.R b/tests/test-split-sliding-window.R new file mode 100644 index 00000000..ea9d712e --- /dev/null +++ b/tests/test-split-sliding-window.R @@ -0,0 +1,1287 @@ +## This file is part of coronet, which is free software: you +## can redistribute it and/or modify it under the terms of the GNU General +## Public License as published by the Free Software Foundation, version 2. 
+## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +## +## Copyright 2017-2019 by Claus Hunsen +## Copyright 2017 by Felix Prasse +## Copyright 2018 by Thomas Bock +## Copyright 2020 by Thomas Bock +## Copyright 2018 by Christian Hechtl +## Copyright 2018 by Jakob Kronawitter +## Copyright 2019 by Anselm Fehnker +## All Rights Reserved. + + +context("Splitting functionality, using sliding windows.") + +## +## Context +## + +CF.DATA = file.path(".", "codeface-data") +CF.SELECTION.PROCESS = "testing" +CASESTUDY = "test" +ARTIFACT = "feature" + +## use only when debugging this file independently +if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") + + +## +## NOTE +## + +## In this test file, we rather test the raw data contents of the data objects +## instead of the networks that can be constructed from these data items! 
+ + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Split data -------------------------------------------------------------- + +## * time-based ------------------------------------------------------------ + +## * * time period --------------------------------------------------------- + +## +## Tests for split.data.time.based(..., split.basis = 'commits'), using sliding windows +## + +test_that("Split a data object time-based (split.basis = 'commits', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.time.based(project.data, time.period = "3 min", + split.basis = "commits", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2016-07-12 15:58:59-2016-07-12 16:01:59", + "2016-07-12 16:00:29-2016-07-12 16:03:29", + "2016-07-12 16:01:59-2016-07-12 16:04:59", + "2016-07-12 16:03:29-2016-07-12 16:06:29", + "2016-07-12 16:04:59-2016-07-12 16:06:33" + ) + result = proj.conf$get.value("ranges") + + expect_equal(result, expected, info = "Time ranges.") + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$commits[1:2, ], + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$commits[2, ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$commits[0, ], + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$commits[3:5, ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commits[3:8, ] + ), + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$mails[0, ], + 
"2016-07-12 16:00:29-2016-07-12 16:03:29" = data$mails[0, ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$mails[rownames(data$mails) == 16, ], + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$mails[rownames(data$mails) %in% c(16, 17), ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$mails[rownames(data$mails) == 17, ] + ), + issues = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(14, 20:22), ], + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$issues[rownames(data$issues) %in% c(14:15), ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(15, 29), ], + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$issues[rownames(data$issues) == 29, ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$synchronicity, + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$synchronicity, + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$synchronicity, + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$synchronicity, + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$synchronicity + ), + pasta = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$pasta, + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$pasta, + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$pasta, + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$pasta, + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}) + + +## +## Tests for 
split.data.time.based(..., split.basis = 'mails'), using sliding windows +## + +test_that("Split a data object time-based (split.basis = 'mails', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.time.based(project.data, time.period = "3 years", + split.basis = "mails", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2004-10-09 18:38:13-2007-10-10 12:38:13", + "2006-04-10 15:38:13-2009-04-10 09:38:13", + "2007-10-10 12:38:13-2010-10-10 06:38:13", + "2009-04-10 09:38:13-2012-04-10 03:38:13", + "2010-10-10 06:38:13-2013-10-10 00:38:13", + "2012-04-10 03:38:13-2015-04-10 21:38:13", + "2013-10-10 00:38:13-2016-07-12 16:05:38" + ) + result = proj.conf$get.value("ranges") + + expect_equal(result, expected, info = "Time ranges.") + + ## check data for all ranges + expected.data = list( + commits = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$commits[0, ], + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$commits[0, ], + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$commits[0, ], + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$commits[0, ], + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$commits[0, ], + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$commits[0, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] + ), + mails = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[rownames(data$mails) %in% 1:2, ], + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$mails[0, ], + "2007-10-10 12:38:13-2010-10-10 
06:38:13" = data$mails[rownames(data$mails) %in% 3:12, ], + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$mails[rownames(data$mails) %in% 3:12, ], + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$mails[0, ], + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$mails[0, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 13:17, ] + ), + issues = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$issues[0, ], + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$issues[0, ], + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$issues[0, ], + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$issues[0, ], + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 27:29), ] + ), + synchronicity = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$synchronicity, + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$synchronicity, + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$synchronicity, + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$synchronicity, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$synchronicity, + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$synchronicity, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$synchronicity + ), + pasta = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$pasta, + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$pasta, + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$pasta, + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$pasta, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$pasta, + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$pasta, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, 
function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}) + + +## +## Tests for split.data.time.based(..., split.basis = 'issues'), using sliding windows +## + +test_that("Split a data object time-based (split.basis = 'issues', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.time.based(project.data, time.period = "2 years", + split.basis = "issues", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2013-04-21 23:52:09-2015-04-22 11:52:09", + "2014-04-22 05:52:09-2016-04-21 17:52:09", + "2015-04-22 11:52:09-2017-04-21 23:52:09", + "2016-04-21 17:52:09-2017-05-23 12:32:40" + ) + result = proj.conf$get.value("ranges") + + expect_equal(result, expected, info = "Time ranges.") + + ## check data for all ranges + expected.data = list( + commits = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$commits[0, ], + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$commits[0, ], + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commits, + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$commits + ), + mails = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$mails[0, ], + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$mails[0, ], + "2015-04-22 11:52:09-2017-04-21 23:52:09" = 
data$mails[rownames(data$mails) %in% 14:17, ], + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 14:17, ] + ), + issues = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$issues[rownames(data$issues) %in% 1:13, ], + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$issues[0, ], + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% 14:34, ], + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 14:36, ] + ), + synchronicity = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$synchronicity, + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$synchronicity, + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$synchronicity, + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$synchronicity + ), + pasta = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$pasta, + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$pasta, + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$pasta, + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}) + +## * * bins ---------------------------------------------------------------- + +## +## Tests for split.data.time.based(..., bins = ...), sliding windows parameter ignored +## + +test_that("Split a data object time-based (bins = ... 
, sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.time.based(project.data, bins = c("2016-01-01 00:00:00", "2016-12-31 23:59:59", + "2017-06-03 03:03:03"), + split.basis = "mails", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2016-01-01 00:00:00-2016-12-31 23:59:59", + "2016-12-31 23:59:59-2017-06-03 03:03:03" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges.") + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commits, + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$commits[0, ] + ), + mails = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ], + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$mails[0, ] + ), + issues = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% 14:34, ], + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$issues[rownames(data$issues) %in% 35:36, ] + ), + synchronicity = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$synchronicity, + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$synchronicity + ), + pasta = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$pasta, + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = 
lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}) + +## * activity-based -------------------------------------------------------- + +## +## Tests for split.data.activity.based(..., activity.type = 'commits') using sliding windows +## + +test_that("Split a data object activity-based (activity.type = 'commits', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.activity.based(project.data, activity.amount = 3, + activity.type = "commits", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2016-07-12 15:58:59-2016-07-12 16:06:10", + "2016-07-12 16:00:45-2016-07-12 16:06:20", + "2016-07-12 16:06:10-2016-07-12 16:06:32", + "2016-07-12 16:06:20-2016-07-12 16:06:33" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges (activity.amount).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:3, ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commits[2:4, ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ] + ), + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[rownames(data$mails) %in% 16:17, ], + 
"2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] + ), + issues = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 29), ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity + ), + pasta = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$pasta, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges (activity.amount).") + + ## + ## split by too-large activity amount + ## + + ## split data + results = split.data.activity.based(project.data, activity.amount = nrow(data$commits) + 10, + activity.type = "commits", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2016-07-12 15:58:59-2016-07-12 16:06:33" + ) + result = proj.conf$get.value("ranges") + 
expect_equal(result, expected, info = "Time ranges (too-large activity amount).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commits + ), + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$mails[rownames(data$mails) %in% 16:17, ] + ), + issues = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(14:15, 20:23, 29), ] + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$synchronicity + ), + pasta = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges for too-large activity amount (activity.amount).") + + ## + ## split by number of windows (i.e., ignoring sliding windows) + ## + + ## split data + results = split.data.activity.based(project.data, number.windows = 2, + activity.type = "commits", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2016-07-12 15:58:59-2016-07-12 16:06:20", + "2016-07-12 16:06:20-2016-07-12 16:06:33" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges (number.windows).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$commits[1:4, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ] + ), + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] + ), + 
issues = list( + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$synchronicity, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity + ), + pasta = list( + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$pasta, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") + + ## too large number of windows (i.e., ignoring sliding windows) + + expect_error( + split.data.activity.based(project.data, activity.type = "commits", + number.windows = nrow(project.data$get.commits()) + 10, sliding.window = TRUE), + info = "Error expected (number.windows) (1)." + ) + + expect_error( + split.data.activity.based(project.data, activity.type = "commits", number.windows = 0, sliding.window = TRUE), + info = "Error expected (number.windows) (2)." 
+ ) + +}) + +test_that("Split a data object activity-based (activity.type = 'commits', sliding.window = TRUE), continued.", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + + ## add one commit to the commit data having same date as latest commit + commit.data = project.data$get.commits() + latest.commit = commit.data[nrow(commit.data), ] + latest.commit[1, "commit.id"] = "" + latest.commit[1, "hash"] = "abcdefghijklmnopqrstuvxyz" + commit.data = rbind(commit.data, latest.commit) + project.data$set.commits(commit.data) + + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.activity.based(project.data, activity.amount = 3, + activity.type = "commits", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2016-07-12 15:58:59-2016-07-12 16:06:10", + "2016-07-12 16:00:45-2016-07-12 16:06:20", + "2016-07-12 16:06:10-2016-07-12 16:06:32", + "2016-07-12 16:06:20-2016-07-12 16:06:33", + "2016-07-12 16:06:32-2016-07-12 16:06:33" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges (activity.amount).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:3, ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commits[2:4, ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ], + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commits[7:9, ] + ), + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[rownames(data$mails) 
%in% 16:17, ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ], + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$mails[0, ] + ), + issues = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 29), ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ], + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity, + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$synchronicity + ), + pasta = list( + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$pasta, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta, + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges (activity.amount).") + +}) + + +## +## Tests for split.data.activity.based(..., activity.type = 'mails') 
using sliding windows +## + +test_that("Split a data object activity-based (activity.type = 'mails', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.activity.based(project.data, activity.amount = 3, + activity.type = "mails", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2004-10-09 18:38:13-2010-07-12 11:05:35", + "2005-02-09 18:49:49-2010-07-12 12:05:34", + "2010-07-12 11:05:35-2010-07-12 12:05:41", + "2010-07-12 12:05:34-2010-07-12 12:05:42", + "2010-07-12 12:05:41-2010-07-12 12:05:44", + "2010-07-12 12:05:42-2010-07-12 12:05:45", + "2010-07-12 12:05:44-2016-07-12 15:58:40", + "2010-07-12 12:05:45-2016-07-12 15:58:50", + "2016-07-12 15:58:40-2016-07-12 16:05:37", + "2016-07-12 15:58:50-2016-07-12 16:05:38" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges.") + + ## check data for all ranges + expected.data = list( + commits = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$commits[0, ], + "2005-02-09 18:49:49-2010-07-12 12:05:34" = data$commits[0, ], + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$commits[0, ], + "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$commits[0, ], + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$commits[0, ], + "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$commits[0, ], + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$commits[0, ], + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$commits[0, ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = 
data$commits[1:2, ], + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$commits[1:2, ] + ), + mails = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$mails[rownames(data$mails) %in% 1:3, ], + "2005-02-09 18:49:49-2010-07-12 12:05:34" = data$mails[rownames(data$mails) %in% 2:4, ], + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$mails[rownames(data$mails) %in% 4:6, ], + "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$mails[rownames(data$mails) %in% 5:7, ], + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$mails[rownames(data$mails) %in% 7:9, ], + "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$mails[rownames(data$mails) %in% 8:10, ], + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$mails[rownames(data$mails) %in% 10:12, ], + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$mails[rownames(data$mails) %in% c(11:12, 14), ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$mails[rownames(data$mails) %in% 14:16, ], + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 15:17, ] + ), + issues = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$issues[0, ], + "2005-02-09 18:49:49-2010-07-12 12:05:34" = data$issues[0, ], + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$issues[0, ], + "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$issues[0, ], + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$issues[0, ], + "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$issues[0, ], + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:13, 27:28), ], + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$issues[rownames(data$issues) %in% c(1:13, 27:28), ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ], + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29), ] + ), + synchronicity = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$synchronicity, + "2005-02-09 18:49:49-2010-07-12 
12:05:34" = data$synchronicity, + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$synchronicity, + "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$synchronicity, + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$synchronicity, + "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$synchronicity, + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$synchronicity, + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$synchronicity, + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$synchronicity, + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$synchronicity + ), + pasta = list( + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$pasta, + "2005-02-09 18:49:49-2010-07-12 12:05:34" = data$pasta, + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$pasta, + "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$pasta, + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$pasta, + "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$pasta, + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$pasta, + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$pasta, + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$pasta, + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges.") + + ## + ## split by too-large activity amount + ## + + ## split data + results = split.data.activity.based(project.data, activity.amount = nrow(data$mails) + 10, + activity.type = "mails", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2004-10-09 18:38:13-2016-07-12 16:05:38" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info =
"Time ranges (too-large activity amount).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] + ), + mails = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$mails + ), + issues = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29), ] + ), + synchronicity = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$synchronicity + ), + pasta = list( + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges (too-large activity amount).") + + ## + ## split by number of windows (i.e., ignoring sliding windows) + ## + + ## split data (same 'mails' activity type as the other calls in this test) + results = split.data.activity.based(project.data, number.windows = 2, + activity.type = "mails", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2004-10-09 18:38:13-2010-07-12 12:05:43", + "2010-07-12 12:05:43-2016-07-12 16:05:38" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges (number.windows).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$commits[0, ], + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$commits[1:2, ] + ), + mails = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$mails[rownames(data$mails) %in% 1:8, ], + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 9:17, ] + ), + issues = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" =
data$issues[0, ], + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29), ] + ), + synchronicity = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$synchronicity, + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$synchronicity + ), + pasta = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$pasta, + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") + + ## too large number of windows (i.e., ignoring sliding windows) + + expect_error( + split.data.activity.based(project.data, activity.type = "mails", + number.windows = nrow(project.data$get.mails()) + 10, sliding.window = TRUE), + info = "Error expected (number.windows) (1)." + ) + + expect_error( + split.data.activity.based(project.data, activity.type = "mails", number.windows = 0, sliding.window = TRUE), + info = "Error expected (number.windows) (2)." 
+ ) +}) + + +## +## Tests for split.data.activity.based(..., activity.type = 'issues') using sliding windows +## + +test_that("Split a data object activity-based (activity.type = 'issues', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits(), + mails = project.data$get.mails(), + issues = project.data$get.issues(), + synchronicity = project.data$get.synchronicity(), + pasta = project.data$get.pasta() + ) + + ## split data + results = split.data.activity.based(project.data, activity.amount = 9, + activity.type = "issues", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2013-04-21 23:52:09-2013-05-25 06:22:23", + "2013-05-06 01:04:34-2016-07-12 15:59:25", + "2013-05-25 06:22:23-2016-07-12 16:03:59", + "2016-07-12 15:59:25-2016-07-27 20:12:08", + "2016-07-12 16:03:59-2016-10-05 15:30:02", + "2016-07-27 20:12:08-2017-05-23 12:31:34", + "2016-10-05 15:30:02-2017-05-23 12:32:40" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges.") + + ## check data for all ranges + expected.data = list( + commits = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commits[0, ], + "2013-05-06 01:04:34-2016-07-12 15:59:25" = data$commits[1, ], + "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$commits[1:2, ], + "2016-07-12 15:59:25-2016-07-27 20:12:08" = data$commits[2:8, ], + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$commits[3:8, ], + "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$commits[0, ], + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commits[0, ] + ), + mails = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$mails[0, ], + "2013-05-06 01:04:34-2016-07-12 15:59:25" = data$mails[rownames(data$mails) %in% 
14:15, ], + "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$mails[rownames(data$mails) %in% 14:15, ], + "2016-07-12 15:59:25-2016-07-27 20:12:08" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$mails[rownames(data$mails) %in% 16:17, ], + "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$mails[0, ], + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$mails[0, ] + ), + issues = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$issues[rownames(data$issues) %in% 1:10, ], + "2013-05-06 01:04:34-2016-07-12 15:59:25" = data$issues[rownames(data$issues) %in% c(6:13, 27:28), ], + "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$issues[rownames(data$issues) %in% c(11:15, 20:22, 27:28), ], + "2016-07-12 15:59:25-2016-07-27 20:12:08" = data$issues[rownames(data$issues) %in% c(14:17, 20:23, 29),], + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$issues[rownames(data$issues) %in% c(16:19, 23:25, 29:30), ], + "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 30:34),], + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(26, 31:36), ] + ), + synchronicity = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$synchronicity, + "2013-05-06 01:04:34-2016-07-12 15:59:25" = data$synchronicity, + "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$synchronicity, + "2016-07-12 15:59:25-2016-07-27 20:12:08" = data$synchronicity, + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$synchronicity, + "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$synchronicity, + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$synchronicity + ), + pasta = list( + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$pasta, + "2013-05-06 01:04:34-2016-07-12 15:59:25" = data$pasta, + "2013-05-25 06:22:23-2016-07-12 16:03:59" = data$pasta, + "2016-07-12 15:59:25-2016-07-27 20:12:08" = data$pasta, + "2016-07-12 16:03:59-2016-10-05 15:30:02" = data$pasta, + "2016-07-27 
20:12:08-2017-05-23 12:31:34" = data$pasta, + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges.") + + ## + ## split by too-large activity amount + ## + + ## split data + results = split.data.activity.based(project.data, activity.amount = nrow(data$issues) + 10, + activity.type = "issues", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2013-04-21 23:52:09-2017-05-23 12:32:40" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges (too-large activity amount).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$commits + ), + mails = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 14:17, ] + ), + issues = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$issues + ), + synchronicity = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$synchronicity + ), + pasta = list( + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges (too-large activity amount).") + + ## + ## split by number 
of windows (i.e., ignoring sliding windows) + ## + + ## split data + results = split.data.activity.based(project.data, number.windows = 2, + activity.type = "issues", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2013-04-21 23:52:09-2016-07-12 16:02:30", + "2016-07-12 16:02:30-2017-05-23 12:32:40" + ) + result = proj.conf$get.value("ranges") + expect_equal(result, expected, info = "Time ranges (number.windows).") + + ## check data for all ranges + expected.data = list( + commits = list( + "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$commits[1:2, ], + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$commits[3:8, ] + ), + mails = list( + "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$mails[rownames(data$mails) %in% 14:15, ], + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 16:17, ] + ), + issues = list( + "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$issues[rownames(data$issues) %in% c(1:14, 20:22, 27:28), ], + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(15:19, 23:26, 29:36), ] + ), + synchronicity = list( + "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$synchronicity, + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$synchronicity + ), + pasta = list( + "2013-04-21 23:52:09-2016-07-12 16:02:30" = data$pasta, + "2016-07-12 16:02:30-2017-05-23 12:32:40" = data$pasta + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()) + ) + expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") + + ## too large number of windows (i.e., ignoring sliding windows) + + expect_error( + split.data.activity.based(project.data, 
activity.type = "issues", + number.windows = nrow(project.data$get.issues()) + 10, sliding.window = TRUE), + info = "Error expected (number.windows) (1)." + ) + + expect_error( + split.data.activity.based(project.data, activity.type = "issues", number.windows = 0, sliding.window = TRUE), + info = "Error expected (number.windows) (2)." + ) +}) + + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Split network ----------------------------------------------------------- + +## * time-based ------------------------------------------------------------ + +## * * time period --------------------------------------------------------- + +## +## Tests for split.network.time.based(..., time.period = ...) using sliding windows +## + +test_that("Split a network time-based (time.period = ... , sliding.window = TRUE).", { + + ## time period + time.period = "2 mins" + + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + net.conf = NetworkConf$new() + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) + project.data = ProjectData$new(proj.conf) + net.builder = NetworkBuilder$new(project.data, net.conf) + + ## + ## simplify = FALSE + ## + + ## retrieve author network + author.net = net.builder$get.author.network() + + expected = list( + "2016-07-12 15:58:59-2016-07-12 16:00:59" = igraph::subgraph.edges(author.net, c(1:2)), + "2016-07-12 15:59:59-2016-07-12 16:01:59" = igraph::subgraph.edges(author.net, c(2)), + "2016-07-12 16:00:59-2016-07-12 16:02:59" = igraph::subgraph.edges(author.net, c()), + "2016-07-12 16:01:59-2016-07-12 16:03:59" = igraph::subgraph.edges(author.net, c()), + "2016-07-12 16:02:59-2016-07-12 16:04:59" = igraph::subgraph.edges(author.net, c()), + "2016-07-12 16:03:59-2016-07-12 16:05:59" = igraph::subgraph.edges(author.net, c(3,5)), + "2016-07-12 16:04:59-2016-07-12 16:06:33" = 
igraph::subgraph.edges(author.net, c(3:8)) + ) + results = split.network.time.based(author.net, time.period = time.period, sliding.window = TRUE) + + ## check ranges (labels) + expect_equal(names(results), names(expected), info = "Time ranges.") + + ## check networks + check.identical = mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Network equality.") + + ## + ## simplify = TRUE + ## + + ## update network configuration + net.builder$update.network.conf(list(author.relation = "cochange", simplify = TRUE)) + net.builder$reset.environment() + + ## retrieve author network; splitting it with the same time period as above has to fail + author.net = net.builder$get.author.network() + + expect_error(split.network.time.based(author.net, time.period = time.period, sliding.window = TRUE), info = "Illegal split.") + +}) + +## * activity-based ------------------------------------------------------------ + +## +## Tests for split.network.activity.based(...) using sliding windows +## + +test_that("Split a network activity-based (number.edges, number.windows, sliding.window = TRUE).", { + + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + net.conf = NetworkConf$new() + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) + project.data = ProjectData$new(proj.conf) + net.builder = NetworkBuilder$new(project.data, net.conf) + + ## retrieve author network + author.net = net.builder$get.author.network() + + ## + ## number.edges (1) + ## + + ## results + expected = list( + "2016-07-12 15:58:59-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(1, 2)), + "2016-07-12 16:00:45-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(2, 3)), + "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(3, 5)), + "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(5,
4)), + "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 7)), + "2016-07-12 16:06:10-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(7, 6)), + "2016-07-12 16:06:32-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(6, 8)) + ) + results = split.network.activity.based(author.net, number.edges = 2, sliding.window = TRUE) + + ## check ranges (labels) + expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).") + + ## check networks + check.identical = mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Network equality (number.edges (1)).") + + ## + ## number.edges (2) + ## + + ## results + expected = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(1:igraph::ecount(author.net))) + ) + results = split.network.activity.based(author.net, number.edges = igraph::ecount(author.net) + 10, + sliding.window = TRUE) + + ## check ranges (labels) + expect_equal(names(results), names(expected), info = "Time ranges (number.edges (2)).") + + ## check networks + check.identical = mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Network equality (number.edges (2)).") + + ## + ## number.windows (1) (i.e., ignoring sliding windows) + ## + + ## results + expected = list( + "2016-07-12 15:58:59-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(1, 2, 3)), + "2016-07-12 16:05:41-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 5, 7)), + "2016-07-12 16:06:32-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(6, 8)) + ) + results = split.network.activity.based(author.net, number.windows = 3, sliding.window = TRUE) + + ## check ranges (labels) + expect_equal(names(results), names(expected), info = "Time ranges (number.windows (1)).") + + ## check networks + check.identical = 
mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Network equality (number.windows (1)).") + + ## + ## number.windows (2) (i.e., ignoring sliding windows) + ## + + expect_error( + split.network.activity.based(author.net, number.windows = igraph::ecount(author.net) + 10, + sliding.window = TRUE), + info = "Error expected (number.windows (2))." + ) + +}) + +test_that("Split a network activity-based (number.edges, number.windows, sliding.window = TRUE), continued.", { + + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + net.conf = NetworkConf$new() + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) + project.data = ProjectData$new(proj.conf) + net.builder = NetworkBuilder$new(project.data, net.conf) + + ## retrieve author network and add an additional edge in the end + author.net = net.builder$get.author.network() + author.net = igraph::add_edges(author.net, c("Olaf", "Thomas"), + attr = list(date = get.date.from.string("2020-02-20 20:20:20"))) + + ## + ## number.edges (1) + ## + + ## results + expected = list( + "2016-07-12 15:58:59-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(1, 2)), + "2016-07-12 16:00:45-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(2, 3)), + "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(3, 5)), + "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(5, 4)), + "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 7)), + "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(7, 6)), + "2016-07-12 16:06:32-2020-02-20 20:20:20" = igraph::subgraph.edges(author.net, c(6, 8)), + "2016-07-12 16:06:32-2020-02-20 20:20:21" = igraph::subgraph.edges(author.net, c(8, 9)) + ) + 
results = split.network.activity.based(author.net, number.edges = 2, sliding.window = TRUE) + + ## check ranges (labels) + expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).") + + ## check networks + check.identical = mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Network equality (number.edges (1)).") + +}) diff --git a/tests/test-split.R b/tests/test-split.R index 652cbecb..b97926f5 100644 --- a/tests/test-split.R +++ b/tests/test-split.R @@ -14,6 +14,7 @@ ## Copyright 2017-2019 by Claus Hunsen ## Copyright 2017 by Felix Prasse ## Copyright 2018 by Thomas Bock +## Copyright 2020 by Thomas Bock ## Copyright 2018 by Christian Hechtl ## Copyright 2018 by Jakob Kronawitter ## Copyright 2019 by Anselm Fehnker @@ -47,7 +48,6 @@ if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") ## TODO ## -## - sliding.window = TRUE ## - net.conf$update.values(list(pasta = TRUE, synchronicity = TRUE)) @@ -62,7 +62,7 @@ if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") ## Tests for split.data.time.based(..., split.basis = 'commits') ## -test_that("Split a data object time-based (split.basis == 'commits').", { +test_that("Split a data object time-based (split.basis = 'commits').", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -137,7 +137,7 @@ test_that("Split a data object time-based (split.basis == 'commits').", { ## Tests for split.data.time.based(..., split.basis = 'mails') ## -test_that("Split a data object time-based (split.basis == 'mails').", { +test_that("Split a data object time-based (split.basis = 'mails').", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -219,7 +219,7 @@ test_that("Split a data object time-based (split.basis == 'mails').", { ## Tests for split.data.time.based(..., 
split.basis = 'issues') ## -test_that("Split a data object time-based (split.basis == 'issues').", { +test_that("Split a data object time-based (split.basis = 'issues').", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -296,7 +296,7 @@ test_that("Split a data object time-based (split.basis == 'issues').", { ## Tests for split.data.time.based(..., bins = ...) ## -test_that("Split a data object time-based (bins == ... ).", { +test_that("Split a data object time-based (bins = ... ).", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -1086,7 +1086,7 @@ test_that("Split a network time-based (time.period = ...).", { ## Tests for split.networks.time.based(..., time.period = ...) ## -test_that("Split a list of networks time-based.", { +patrick::with_parameters_test_that("Split a list of networks time-based, ", { ## time period time.period = "2 years" @@ -1111,7 +1111,7 @@ test_that("Split a list of networks time-based.", { net.split = split.networks.time.based( networks = list(net.cochange, net.mail), time.period = time.period, - sliding.window = FALSE + sliding.window = test.sliding.window ) ## check whether the splitting information of the two split networks are identical @@ -1121,10 +1121,13 @@ test_that("Split a list of networks time-based.", { net.split = split.networks.time.based( networks = list(net.mail), time.period = time.period, - sliding.window = FALSE + sliding.window = test.sliding.window ) -}) +}, patrick::cases( + "sliding window: FALSE" = list(test.sliding.window = FALSE), + "sliding window: TRUE" = list(test.sliding.window = TRUE) +)) ## * * bins ---------------------------------------------------------------- @@ -1132,7 +1135,7 @@ test_that("Split a list of networks time-based.", { ## Tests for split.network.time.based(..., bins = ...) 
## -test_that("Split a network time-based (bins = ...).", { +patrick::with_parameters_test_that("Split a network time-based (bins = ...), ", { ## bins bins = c("2016-07-12 15:58:00", "2016-07-12 16:00:59", "2016-07-12 16:02:59", @@ -1160,7 +1163,7 @@ test_that("Split a network time-based (bins = ...).", { "2016-07-12 16:02:59-2016-07-12 16:04:59" = igraph::subgraph.edges(author.net, c()), "2016-07-12 16:04:59-2016-07-12 17:21:43" = igraph::subgraph.edges(author.net, c(3:8)) ) - results = split.network.time.based(author.net, bins = bins) + results = split.network.time.based(author.net, bins = bins, sliding.window = test.sliding.window) ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges.") @@ -1182,9 +1185,13 @@ test_that("Split a network time-based (bins = ...).", { ## retrieve author network author.net = net.builder$get.author.network() - expect_error(split.network.time.based(author.net, bins = bins), info = "Illegal split.") + expect_error(split.network.time.based(author.net, bins = bins, sliding.window = test.sliding.window), + info = "Illegal split.") -}) +}, patrick::cases( + "sliding window (ignored): FALSE" = list(test.sliding.window = FALSE), + "sliding window (ignored): TRUE" = list(test.sliding.window = TRUE) +)) ## * * ranges -------------------------------------------------------------------- diff --git a/util-data.R b/util-data.R index 8694ab35..6b18f8f9 100644 --- a/util-data.R +++ b/util-data.R @@ -13,6 +13,7 @@ ## ## Copyright 2016-2019 by Claus Hunsen ## Copyright 2017-2019 by Thomas Bock +## Copyright 2020 by Thomas Bock ## Copyright 2017 by Raphael Nömmer ## Copyright 2017-2018 by Christian Hechtl ## Copyright 2017 by Felix Prasse @@ -1146,7 +1147,7 @@ ProjectData = R6::R6Class("ProjectData", ## get the timestamp data as vector timestamps.df = self$get.data.timestamps(data.sources = data.sources , simple = TRUE) - timestamps = c(start = timestamps.df[, "start"], end = timestamps.df[, "end"]) + timestamps = 
c(start = timestamps.df[, "start"], end = timestamps.df[, "end"] + 1) ## check consistency if (timestamps["start"] > timestamps["end"]) { diff --git a/util-misc.R b/util-misc.R index 26002921..0fc632d7 100644 --- a/util-misc.R +++ b/util-misc.R @@ -16,6 +16,7 @@ ## Copyright 2017 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock +## Copyright 2020 by Thomas Bock ## Copyright 2018-2019 by Jakob Kronawitter ## All Rights Reserved. @@ -568,7 +569,10 @@ construct.overlapping.ranges = function(start, end, time.period, overlap, imperf ## compute negative overlap overlap.negative = time.period - overlap ## compute number of complete bins - bins.number = round(bins.duration / overlap.negative) + bins.number = floor(bins.duration / overlap.negative) + if (bins.number < 1) { + bins.number = 1 + } ## generate a approximate sequence of dates which can be streamlined later seq.start = start.date + overlap diff --git a/util-networks-misc.R b/util-networks-misc.R index a98e999f..a9053220 100644 --- a/util-networks-misc.R +++ b/util-networks-misc.R @@ -11,9 +11,10 @@ ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
## -## Copyright 2016 by Sofie Kemper -## Copyright 2016 by Claus Hunsen +## Copyright 2016-2017 by Sofie Kemper +## Copyright 2016-2017 by Claus Hunsen ## Copyright 2016-2018 by Thomas Bock +## Copyright 2020 by Thomas Bock ## Copyright 2017 by Angelika Schmid ## Copyright 2019 by Jakob Kronawitter ## Copyright 2019-2020 by Anselm Fehnker @@ -224,7 +225,7 @@ convert.adjacency.matrix.list.to.array = function(adjacency.list){ colnames(array) = colnames(adjacency.list[[1]]) ## copy the activity values from the adjacency matrices in the list to the corresponding array slices - for (i in seq_along(adjacency.ist)){ + for (i in seq_along(adjacency.list)){ adjacency = adjacency.list[[i]] activity.indices = which(adjacency != 0, arr.ind = TRUE) diff --git a/util-plot.R b/util-plot.R index 0c009392..fdb60232 100644 --- a/util-plot.R +++ b/util-plot.R @@ -14,6 +14,7 @@ ## Copyright 2017-2018 by Claus Hunsen ## Copyright 2018 by Barbara Eckl ## Copyright 2018 by Thomas Bock +## Copyright 2020 by Thomas Bock ## All Rights Reserved. @@ -46,11 +47,13 @@ PLOT.VERTEX.LABEL.COLOR = "gray60" #' Construct a ggplot2/ggraph plot object for the given network and print it directly. #' -#' As a layout, by default, \code{igraph::layout.kamada.kawai} (also known as \code{igraph::layout_with_kk}) +#' As a layout, by default, the "kk" layout from igraph (also known as "layout_kamada_kawai") #' is used, unless a graph attribute "layout" is set. For a comprehensive list of layouts and more information -#' on layouts in general, see \link{http://igraph.org/r/doc/layout_.html}. +#' on layouts in general, see \link{https://igraph.org/python/doc/tutorial/tutorial.html#layout-algorithms}. #' To set the graph attribute on your network, run the following code while replacing \code{layout.to.set} #' to your liking: \code{network = igraph::set.graph.attribute(network, "layout", layout.to.set)}. 
+#' Note that \code{layout.to.set} refers to one of the "short names" of the respective igraph layout, as +#' specified on the Web site in the link given above. #' #' Note: The names for the vertex types are taken from the variables \code{PLOT.VERTEX.TYPE.AUTHOR} and #' \code{PLOT.VERTEX.TYPE.ARTIFACT}. The defaults are \code{"Developer"} and \code{TYPE.ARTIFACT}, respectively. @@ -68,11 +71,13 @@ plot.network = function(network, labels = TRUE) { #' Construct a ggplot2/ggraph plot object for the given network and print it directly. #' -#' As a layout, by default, \code{igraph::layout.kamada.kawai} (also known as \code{igraph::layout_with_kk}) +#' As a layout, by default, the "kk" layout from igraph (also known as "layout_kamada_kawai") #' is used, unless a graph attribute "layout" is set. For a comprehensive list of layouts and more information -#' on layouts in general, see \link{http://igraph.org/r/doc/layout_.html}. +#' on layouts in general, see \link{https://igraph.org/python/doc/tutorial/tutorial.html#layout-algorithms}. #' To set the graph attribute on your network, run the following code while replacing \code{layout.to.set} #' to your liking: \code{network = igraph::set.graph.attribute(network, "layout", layout.to.set)}. +#' Note that \code{layout.to.set} refers to one of the "short names" of the respective igraph layout, as +#' specified on the Web site in the link given above. #' #' Note: The names for the vertex types are taken from the variables \code{PLOT.VERTEX.TYPE.AUTHOR} and #' \code{PLOT.VERTEX.TYPE.ARTIFACT}. The defaults are \code{"Developer"} and \code{TYPE.ARTIFACT}, respectively. @@ -91,11 +96,13 @@ plot.print.network = function(network, labels = TRUE) { #' Construct a ggplot2/ggraph plot object for the given network. 
#' -#' As a layout, by default, \code{igraph::layout.kamada.kawai} (also known as \code{igraph::layout_with_kk}) +#' As a layout, by default, the "kk" layout from igraph (also known as "layout_kamada_kawai") #' is used, unless a graph attribute "layout" is set. For a comprehensive list of layouts and more information -#' on layouts in general, see \link{http://igraph.org/r/doc/layout_.html}. +#' on layouts in general, see \link{https://igraph.org/python/doc/tutorial/tutorial.html#layout-algorithms}. #' To set the graph attribute on your network, run the following code while replacing \code{layout.to.set} #' to your liking: \code{network = igraph::set.graph.attribute(network, "layout", layout.to.set)}. +#' Note that \code{layout.to.set} refers to one of the "short names" of the respective igraph layout, as +#' specified on the Web site in the link given above. #' #' Note: The names for the vertex types are taken from the variables \code{PLOT.VERTEX.TYPE.AUTHOR} and #' \code{PLOT.VERTEX.TYPE.ARTIFACT}. The defaults are \code{"Developer"} and \code{TYPE.ARTIFACT}, respectively. 
@@ -123,13 +130,14 @@ plot.get.plot.for.network = function(network, labels = TRUE) { ## fix the type attributes (add new ones, also named) network = plot.fix.type.attributes(network) - ## set network layout + ## set igraph network layout if no layout is set yet if (!("layout" %in% igraph::list.graph.attributes(network))) { - network = igraph::set.graph.attribute(network, "layout", igraph::layout.kamada.kawai) + network = igraph::set.graph.attribute(network, "layout", "kk") } + layout.algorithm = igraph::get.graph.attribute(network, "layout") - ## create a ggraph object - p = ggraph::ggraph(network) + ## create a ggraph object using the specified igraph layout + p = ggraph::ggraph(network, layout = layout.algorithm) ## plot edges if there are any if (igraph::ecount(network) > 0) { diff --git a/util-split.R b/util-split.R index 0b2e4b66..778d6501 100644 --- a/util-split.R +++ b/util-split.R @@ -17,6 +17,7 @@ ## Copyright 2017-2018 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock +## Copyright 2020 by Thomas Bock ## All Rights Reserved. @@ -39,7 +40,7 @@ requireNamespace("lubridate") # for date conversion #' #' @param project.data the *Data object from which the data is retrieved #' @param time.period the time period describing the length of the ranges, a character string, -#' e.g., "3 mins" or "15 days" +#' e.g., "3 mins" or "15 days" [default: "3 months"] #' @param bins the date objects defining the start of ranges (the last date defines the end of the last range, in an #' *exclusive* manner). If set, the 'time.period' parameter is ignored; consequently, 'split.basis' and #' 'sliding.window' do not make sense then either. [default: NULL] @@ -48,7 +49,7 @@ requireNamespace("lubridate") # for date conversion #' consequently, 'split.basis' and 'sliding.window' do not make sense then either. 
#' [default: NULL] #' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues' -#' [default: commits] +#' [default: "commits"] #' @param sliding.window logical indicating whether the splitting should be performed using a sliding-window approach #' [default: FALSE] #' @@ -113,78 +114,73 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = bins.ranges = construct.ranges(bins) names(bins.ranges) = bins.ranges - ## split data - data.split = parallel::mclapply(split.data, function(df.name) { - logging::logdebug("Splitting %s.", df.name) - ## identify bins for data - df = data[[df.name]] - df.bins = findInterval(df[["date"]], bins.date, all.inside = FALSE) - ## split data according to df.bins - df.split = split(df, df.bins) - ## add proper labels/names - names(df.split) = sapply(as.integer(names(df.split)), function(bin) bins[bin]) - return(df.split) - }) - - ## re-arrange data to get the proper list of data per range - logging::logdebug("Re-arranging data.") - data.split = parallel::mclapply(bins.labels, function(bin) lapply(data.split, `[[`, bin)) - names(data.split) = bins.ranges - - ## adapt project configuration - project.data$get.project.conf()$set.revisions(bins, bins.date) - - ## construct RangeData objects - logging::logdebug("Constructing RangeData objects.") - cf.data = parallel::mclapply(bins.ranges, function(range) { - logging::logdebug("Constructing data for range %s.", range) - ## construct object for current range - cf.range.data = RangeData$new(project.data$get.project.conf(), range) - ## get data for current range - df.list = data.split[[range]] - - ## set main data sources: commits, mails, issues - for (data.source in split.data) { - setter.name = sprintf("set.%s", data.source) - cf.range.data[[setter.name]](df.list[[data.source]]) - } - ## set additional data sources: authors, pasta, synchronicity - for (data.source in additional.data.sources) { - setter.name = 
sprintf("set.%s", data.source) - cf.range.data[[setter.name]](additional.data[[data.source]]) - } - - return(cf.range.data) - }) - - ## perform additional steps for sliding-window approach - ## (only if there is more than one range until here) - if (sliding.window && length(bins.ranges) <= 1) { + if ((length(bins.ranges) <= 1) && sliding.window) { logging::logwarn("Sliding-window approach does not apply for one range or less.") - } else if (sliding.window) { - ## compute bins for sliding windows: pairwise middle between dates - bins.date.middle = mapply( - bins.date[1:(length(bins.date) - 1)], - bins.date[2:length(bins.date)], - FUN = function(d1, d2) d1 + ((d2 - d1) / 2) - ) - bins.date.middle = get.date.from.unix.timestamp(bins.date.middle) - - ## split data for sliding windows - cf.data.sliding = split.data.time.based(project.data, bins = bins.date.middle, - split.basis = split.basis, sliding.window = FALSE) + sliding.window = FALSE + } - ## append data to normally-split data - cf.data = append(cf.data, cf.data.sliding) + if (!sliding.window) { + + ## split data + data.split = parallel::mclapply(split.data, function(df.name) { + logging::logdebug("Splitting %s.", df.name) + ## identify bins for data + df = data[[df.name]] + df.bins = findInterval(df[["date"]], bins.date, all.inside = FALSE) + ## split data according to df.bins + df.split = split(df, df.bins) + ## add proper labels/names + names(df.split) = sapply(as.integer(names(df.split)), function(bin) bins[bin]) + return(df.split) + }) - ## sort data object properly by bin starts - bins.ranges.start = c(head(bins.date, -1), head(bins.date.middle, -1)) - cf.data = cf.data[ order(bins.ranges.start) ] + ## re-arrange data to get the proper list of data per range + logging::logdebug("Re-arranging data.") + data.split = parallel::mclapply(bins.labels, function(bin) lapply(data.split, `[[`, bin)) + names(data.split) = bins.ranges + + ## adapt project configuration + 
project.data$get.project.conf()$set.revisions(bins, bins.date) + + ## construct RangeData objects + logging::logdebug("Constructing RangeData objects.") + cf.data = parallel::mclapply(bins.ranges, function(range) { + logging::logdebug("Constructing data for range %s.", range) + ## construct object for current range + cf.range.data = RangeData$new(project.data$get.project.conf(), range) + ## get data for current range + df.list = data.split[[range]] + + ## set main data sources: commits, mails, issues + for (data.source in split.data) { + setter.name = sprintf("set.%s", data.source) + cf.range.data[[setter.name]](df.list[[data.source]]) + } + ## set additional data sources: authors, pasta, synchronicity + for (data.source in additional.data.sources) { + setter.name = sprintf("set.%s", data.source) + cf.range.data[[setter.name]](additional.data[[data.source]]) + } + + return(cf.range.data) + }) - ## construct proper bin vectors for configuration - bins.date = sort(c(bins.date, bins.date.middle)) + } else { + ## perform different steps for sliding-window approach + + ranges = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), + time.period = time.period, overlap = 0.5, raw = FALSE, + include.end.date = FALSE) # bins have already been prepared correctly + bins.info = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), + time.period = time.period, overlap = 0.5, raw = TRUE, + include.end.date = FALSE) # bins have already been prepared correctly + bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info))))) bins = get.date.string(bins.date) + logging::loginfo("Splitting data '%s' into time ranges using sliding windows [%s].", + project.data$get.class.name(), ranges) + cf.data = split.data.time.based.by.ranges(project.data, ranges) + ## update project configuration project.data$get.project.conf()$set.revisions(bins, bins.date, sliding.window = TRUE) for (cf in cf.data) { @@ -217,7 +213,7 @@ 
split.data.time.based = function(project.data, time.period = "3 months", bins = #' #' @param project.data the *Data object from which the data is retrieved #' @param activity.type the type of activity used for splitting, either 'commits', 'mails', or 'issues' -#' [default: commits] +#' [default: "commits"] #' @param activity.amount the amount of activity describing the size of the ranges, a numeric, further #' specified by 'activity.type' [default: 5000] #' @param number.windows the number of consecutive data objects to get from this function @@ -297,21 +293,30 @@ split.data.activity.based = function(project.data, activity.type = c("commits", } else if (sliding.window) { ## get the list of unique items that are used for the bin computation and, thus, also the ## cropping of data - items.unique = unique(data[[ activity.type ]][[ id.column[[activity.type]] ]]) + items.unique = unique(data[[activity.type]][[ id.column[[activity.type]] ]]) items.unique.count = length(items.unique) ## offsets used for cropping (half the first/last bin) offset.start = floor(activity.amount / 2) - offset.end = floor((items.unique.count %% activity.amount) / 2) + offset.end = (items.unique.count - offset.start) %% activity.amount ## cut the data appropriately - items.cut = c( - items.unique[1:offset.start], - items.unique[(items.unique.count - offset.end):items.unique.count] - ) + if (offset.end > 0) { + items.cut = c( + items.unique[seq_len(offset.start)], + items.unique[seq(from = (items.unique.count - offset.end + 1), to = items.unique.count)] + ) + } else { + items.cut = items.unique[seq_len(offset.start)] + } + + ## determine end bin of last sliding-window range + end.event.id = items.unique[(items.unique.count - offset.end + 1)] + end.event.logical = (data[[activity.type]][[ id.column[[activity.type]] ]] == end.event.id) + end.event.date = unique(data[[activity.type]][end.event.logical, ][["date"]]) ## store the data again - data.to.cut = data[[ activity.type ]][[ 
id.column[[activity.type]] ]] %in% items.cut - data[[ activity.type ]] = data[[ activity.type ]][ !data.to.cut, ] + data.to.cut = data[[activity.type]][[ id.column[[activity.type]] ]] %in% items.cut + data[[activity.type]] = data[[activity.type]][ !data.to.cut, ] ## clone the project data and update raw data to split it again project.data.clone = project.data$clone() @@ -337,6 +342,37 @@ split.data.activity.based = function(project.data, activity.type = c("commits", bins.date = sort(c(bins.date, bins.date.middle)) bins = get.date.string(bins.date) + ## if the last regular range and the last sliding-window range end at the same time + ## and the data of the last regular range is contained in the last sliding-window range, then: + ## remove the last regular range as it is not complete and we don't lose data when removing it + last.regular.range = cf.data[[length(cf.data)]] + last.sliding.range = cf.data[[length(cf.data) - 1]] + get.activity.data = paste0("get.", activity.type) + last.regular.range.ids = (last.regular.range[[get.activity.data]]())[[ id.column[[activity.type]] ]] + last.sliding.range.ids = (last.sliding.range[[get.activity.data]]())[[ id.column[[activity.type]] ]] + if (bins.date[length(bins.date)] == bins.date.middle[length(bins.date.middle)] + && all(last.regular.range.ids %in% last.sliding.range.ids) ) { + + cf.data = cf.data[-length(cf.data)] + bins.date = bins.date[-length(bins.date)] + bins = bins[-length(bins)] + } else if (bins.date[length(bins.date)] != bins.date.middle[length(bins.date.middle)]) { + ## adjust the end date of the last sliding-window range, as it might be shorter than it should be: + ## The end of the last range usually is one second after the last event (as end dates are exclusive). 
+ ## In case of sliding windows, the end of the last sliding range needs to be extended to the date of the + ## next event after that range (as end dates are exclusive) to get a full range as for all the previous + ## ranges which end at the beginning of the next range, which is the date of the first event after the + ## actual range. + + ## When we have sliding windows, there are, at least, three ranges (two regular ranges and one + ## sliding-window range). Hence, there are always more than three elements in the bins vector, so accessing + ## bins[length(bins) - 3] cannot throw errors in this case. + name.last.sliding.window = construct.ranges(c(bins[length(bins) - 3], get.date.string(end.event.date))) + names(cf.data)[length(cf.data) - 1] = name.last.sliding.window + bins.date[length(bins.date) - 1] = end.event.date + bins[length(bins) - 1] = get.date.string(end.event.date) + } + ## update project configuration project.data$get.project.conf()$set.revisions(bins, bins.date, sliding.window = TRUE) for (cf in cf.data) { @@ -497,9 +533,10 @@ split.data.time.based.by.ranges = function(project.data, ranges) { #' #' @param network the igraph network to split, needs to have an edge attribute named "date" #' @param time.period the time period describing the length of the ranges, a character string, -#' e.g., "3 mins" or "15 days" +#' e.g., "3 mins" or "15 days" [default: "3 months"] #' @param bins the date objects defining the start of ranges (the last date defines the end of the last range, in an #' *exclusive* manner). If set, the 'time.period' and 'sliding.window' parameters are ignored. +#' [default: NULL] #' @param number.windows the number of consecutive networks to get from this function, implying equally #' time-sized windows for all ranges. If set, the 'time.period' and 'bins' parameters are ignored; #' consequently, 'sliding.window' does not make sense then either. 
@@ -519,7 +556,7 @@ split.network.time.based = function(network, time.period = "3 months", bins = NU if (!is.null(number.windows)) { ## reset bins for the later algorithm bins = NULL - ## remove sliding windows + ## ignore sliding windows sliding.window = FALSE } @@ -530,56 +567,32 @@ split.network.time.based = function(network, time.period = "3 months", bins = NU bins.vector = bins.info[["vector"]] bins.date = get.date.from.string(bins.info[["bins"]]) bins = head(bins.info[["bins"]], -1) - ## logging - logging::loginfo("Splitting network into time ranges [%s].", - paste(bins.info[["bins"]], collapse = ", ")) } else { - ## remove sliding windows + ## specific bins are given, do not use sliding windows sliding.window = FALSE ## find bins for dates bins.date = get.date.from.string(bins) bins.vector = findInterval(dates, bins.date, all.inside = FALSE) - bins = 1:(length(bins.date) - 1) # the last item just closes the last bin - ## logging - logging::loginfo("Splitting network into bins [%s].", paste(bins.date, collapse = ", ")) + bins = seq_len(length(bins.date) - 1) # the last item just closes the last bin } - nets = split.network.by.bins(network, bins, bins.vector, remove.isolates) - ## perform additional steps for sliding-window approach if (sliding.window) { - ## compute bins for sliding windows: pairwise middle between dates - bins.date.middle = mapply( - bins.date[1:(length(bins.date) - 1)], - bins.date[2:length(bins.date)], - FUN = function(d1, d2) d1 + ((d2 - d1) / 2) - ) - bins.date.middle = get.date.from.unix.timestamp(bins.date.middle) - - ## order edges by date - edges.all = igraph::E(network) - edges.dates = igraph::get.edge.attribute(network, "date") - - ## identify edges to cut for sliding-window approach - edges.cut = sapply(edges.dates, function(date) { - date < bins.date.middle[1] || date > bins.date.middle[length(bins.date.middle)] - }) - - ## delete edges from the network and create a new network - network.cut = igraph::delete.edges(network, 
edges.all[edges.cut]) - - ## split network for sliding windows - nets.sliding = split.network.time.based(network.cut, bins = bins.date.middle, sliding.window = FALSE) + ranges = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), + time.period = time.period, overlap = 0.5, raw = FALSE, + include.end.date = FALSE) # bins have already been prepared correctly + bins.info = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), + time.period = time.period, overlap = 0.5, raw = TRUE, + include.end.date = FALSE) # bins have already been prepared correctly + bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info))))) - ## append data to normally-split data - nets = append(nets, nets.sliding) - - ## sort data object properly by bin starts - bins.ranges.start = c(head(bins.date, -1), head(bins.date.middle, -1)) - nets = nets[ order(bins.ranges.start) ] - - ## construct proper bin vectors for configuration - bins.date = sort(c(bins.date, bins.date.middle)) + logging::loginfo("Splitting network into time ranges [%s].", + paste(ranges, collapse = ", ")) + nets = split.network.time.based.by.ranges(network, ranges, remove.isolates) + } else { + logging::loginfo("Splitting network into bins [%s].", + paste(bins.date, collapse = ", ")) + nets = split.network.by.bins(network, bins, bins.vector, remove.isolates) } ## set bin attribute @@ -607,9 +620,10 @@ split.network.time.based = function(network, time.period = "3 months", bins = NU #' #' @param networks the igraph networks to split, needs to have an edge attribute named "date" #' @param time.period the time period describing the length of the ranges, a character string, -#' e.g., "3 mins" or "15 days" +#' e.g., "3 mins" or "15 days" [default: "3 months"] #' @param bins the date objects defining the start of ranges (the last date defines the end of the last range, in an #' *exclusive* manner). If set, the 'time.period' and 'sliding.window' parameters are ignored. 
+#' [default: NULL] #' @param number.windows the number of consecutive networks to get for each network, implying equally #' time-sized windows for all ranges. If set, the 'time.period' and 'bins' parameters are ignored; #' consequently, 'sliding.window' does not make sense then either. @@ -624,12 +638,11 @@ split.networks.time.based = function(networks, time.period = "3 months", bins = number.windows = NULL, sliding.window = FALSE, remove.isolates = TRUE) { - ## number of windows given (ignoring time period and bins) if (!is.null(number.windows)) { ## reset bins for the later algorithm bins = NULL - ## remove sliding windows + ## ignore sliding windows sliding.window = FALSE } @@ -644,10 +657,20 @@ split.networks.time.based = function(networks, time.period = "3 months", bins = dates = get.date.from.unix.timestamp(dates) ## 2) get bin information - bins.info = split.get.bins.time.based(dates, time.period, number.windows) - bins.date = get.date.from.string(bins.info[["bins"]]) + if (sliding.window) { + ranges = construct.overlapping.ranges(start = min(dates), end = max(dates), + time.period = time.period, overlap = 0.5, raw = FALSE, + include.end.date = TRUE) + bins.info = construct.overlapping.ranges(start = min(dates), end = max(dates), + time.period = time.period, overlap = 0.5, raw = TRUE, + include.end.date = TRUE) + bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info))))) + } else { + bins.info = split.get.bins.time.based(dates, time.period, number.windows) + bins.date = get.date.from.string(bins.info[["bins"]]) + } } else { - ## remove sliding windows + ## specific bins are given, do not use sliding windows sliding.window = FALSE ## set the bins to use bins.date = bins @@ -655,8 +678,16 @@ split.networks.time.based = function(networks, time.period = "3 months", bins = ## split all networks to the extracted bins networks.split = lapply(networks, function(net) { - split.network.time.based(net, bins = bins.date, sliding.window = 
sliding.window, - remove.isolates = remove.isolates) + + if (sliding.window) { + nets = split.network.time.based.by.ranges(network = net, ranges = ranges, + remove.isolates = remove.isolates) + attr(nets, "bins") = bins.date + } else { + nets = split.network.time.based(network = net, bins = bins.date, sliding.window = sliding.window, + remove.isolates = remove.isolates) + } + return(nets) }) ## return the split networks @@ -674,7 +705,7 @@ split.networks.time.based = function(networks, time.period = "3 months", bins = #' #' @param network the igraph network to split #' @param number.edges the amount of edges describing the size of the ranges -#' (implying an open number of resulting ranges) +#' (implying an open number of resulting ranges) [default: 5000] #' @param number.windows the number of consecutive networks to get from this function #' (implying an equally distributed amount of edges in each range and #' 'sliding.window = FALSE) [default: NULL] @@ -734,27 +765,39 @@ split.network.activity.based = function(network, number.edges = 5000, number.win ## split network by bins networks = split.network.by.bins(network, bins, bins.vector, remove.isolates) + if (number.edges >= edge.count) { + logging::logwarn("Sliding-window approach does not apply: not enough edges (%s) for number of edges %s", + edge.count, number.edges) + sliding.window = FALSE + } + ## perform additional steps for sliding-window approach ## for activity-based sliding-window bins to work, we need to crop edges appropriately and, ## then, compute bins on the cropped networks if (sliding.window) { - ## order edges by date - edges.by.date = igraph::E(network)[ order(df[["date"]]) ] + + ## get edge ids ordered by date + edges.by.date = df[["my.unique.id"]] ## offsets used for cropping (half the first/last bin) offset.start = floor(number.edges / 2) - offset.end = floor((edge.count %% number.edges) / 2) + offset.end = (edge.count - offset.start) %% number.edges ## cut the data appropriately - edges.cut 
= c( - edges.by.date[1:offset.start], - edges.by.date[(edge.count - offset.end):edge.count] - ) + if (offset.end > 0) { + edges.cut = c( + edges.by.date[seq_len(offset.start)], + edges.by.date[seq(from = (edge.count - offset.end + 1), to = edge.count)] + ) + } else { + edges.cut = edges.by.date[seq_len(offset.start)] + } ## delete edges from the network and create a new network - network.cut = igraph::delete.edges(network, edges.cut) + network.cut = igraph::delete.edges(network, igraph::E(network)[edges.cut]) ## split network for sliding windows - networks.sliding = split.network.activity.based(network.cut, number.edges = number.edges, sliding.window = FALSE) + networks.sliding = split.network.activity.based(network.cut, number.edges = number.edges, + sliding.window = FALSE) ## append data to normally-split data networks = append(networks, networks.sliding) @@ -768,6 +811,21 @@ split.network.activity.based = function(network, number.edges = 5000, number.win ## construct proper bin vectors for configuration bins.date = sort(c(bins.date, bins.date.middle)) + + ## if the last regular range and the last sliding-window range end at the same time + ## and the latter contains the former's edges, then: + ## remove the last regular range as it is not complete and we don't lose data when removing it + edges.last.regular = igraph::E(networks[[length(networks)]]) + edges.last.sliding = igraph::E(networks[[length(networks) - 1]]) + if (bins.date[length(bins.date)] == bins.date.middle[length(bins.date.middle)] + && all(edges.last.regular %in% edges.last.sliding) + && table(edges.last.regular$date) %in% table(edges.last.sliding$date) ) { + + networks = networks[-length(networks)] + bins.date = bins.date[-length(bins.date)] + bins = bins[-length(bins)] + } + } ## set bin attribute @@ -847,7 +905,7 @@ split.data.by.bins = function(df, bins) { #' #' @return a list of networks, with the length of 'unique(bins.vector)' split.network.by.bins = function(network, bins, bins.vector,
remove.isolates = TRUE) { - logging::logdebug("split.data.time.based: starting.") + logging::logdebug("split.network.by.bins: starting.") ## create a network for each bin of edges nets = parallel::mclapply(bins, function(bin) { logging::logdebug("Splitting network: bin %s", bin) @@ -857,7 +915,7 @@ split.network.by.bins = function(network, bins, bins.vector, remove.isolates = T g = igraph::subgraph.edges(network, edges, delete.vertices = remove.isolates) return(g) }) - logging::logdebug("split.data.time.based: finished.") + logging::logdebug("split.network.by.bins: finished.") return(nets) } @@ -969,7 +1027,7 @@ split.get.bins.activity.based = function(df, id, activity.amount, remove.duplica bins.number.complete = length(ids.unique) %/% activity.amount bins.number.incomplete = length(ids.unique) %% activity.amount bins.activity = c( - if (bins.number.complete != 0) rep(1:bins.number.complete, each = activity.amount), + if (bins.number.complete != 0) rep(seq_len(bins.number.complete), each = activity.amount), rep(bins.number.complete + 1, bins.number.incomplete) ) bins.number = max(bins.activity) @@ -981,7 +1039,7 @@ split.get.bins.activity.based = function(df, id, activity.amount, remove.duplica ) ## get the start (and end) date for all bins - bins.date = parallel::mclapply(1:bins.number, function(bin) { + bins.date = parallel::mclapply(seq_len(bins.number), function(bin) { ## get the ids in the bin ids = bins.mapping[ bins.mapping[["bin"]] == bin, "id"] ## grab dates for the ids