diff --git a/tests/test-split-data-activity-based.R b/tests/test-split-data-activity-based.R index f190adef..9fa282ee 100644 --- a/tests/test-split-data-activity-based.R +++ b/tests/test-split-data-activity-based.R @@ -663,8 +663,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10", "2016-07-12 16:00:45-2016-07-12 16:06:20", "2016-07-12 16:06:10-2016-07-12 16:06:32", - "2016-07-12 16:06:20-2016-07-12 16:06:33", - "2016-07-12 16:06:32-2016-07-12 16:06:33" + "2016-07-12 16:06:20-2016-07-12 16:06:33" ) lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, @@ -682,8 +681,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.basis = "commits", split.sliding.window = TRUE, split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:06:10", - "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:33", - "2016-07-12 16:06:33"), + "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:33"), split.revision.dates = NULL ) lapply(results, function(res) { @@ -698,44 +696,38 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:3, ], "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commits[2:4, ], "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ], - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commits[7:8, ] + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ] ), commit.messages = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commit.messages, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commit.messages, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commit.messages, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages, - "2016-07-12 16:06:32-2016-07-12 16:06:33" = 
data$commit.messages + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages ), issues = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:46, 50:51, 53:55), ], "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(16:17, 33, 45:46, 50:51, 53:55), ], "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 25, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 25, ], - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 25, ] ), mails = list( ## comments indicate row names when pasta is not configured "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ], - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$mails[0, ] + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$pasta, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta, - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$pasta + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta ), synchronicity = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity, - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$synchronicity + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity ) ) results.data = 
list( @@ -778,7 +770,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.type = "activity-based", split.length = 18, split.basis = "commits", - split.sliding.window = TRUE, + split.sliding.window = FALSE, split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:33"), split.revision.dates = NULL ) @@ -864,7 +856,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10", "2016-07-12 16:00:45-2016-07-12 16:06:20", "2016-07-12 16:06:10-2016-07-12 16:06:32", - "2016-07-12 16:06:20-2016-07-12 16:06:33", + "2016-07-12 16:06:20-2016-07-12 16:06:32", "2016-07-12 16:06:32-2016-07-12 16:06:33" ) lapply(results, function(res) { @@ -883,7 +875,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.basis = "commits", split.sliding.window = TRUE, split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:06:10", - "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:33", + "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:32", "2016-07-12 16:06:33"), split.revision.dates = NULL ) @@ -899,21 +891,21 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:3, ], "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commits[2:4, ], "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ], + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$commits[5:8, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commits[7:9, ] ), commit.messages = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commit.messages, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commit.messages, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commit.messages, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages, + "2016-07-12 
16:06:20-2016-07-12 16:06:32" = data$commit.messages, "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commit.messages ), issues = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:46, 50:51, 53:55), ], "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(16:17, 33, 45:46, 50:51, 53:55), ], "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 25, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 25, ], + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 25, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] ), mails = list( @@ -921,21 +913,21 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ], + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$mails[0, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$mails[0, ] ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$pasta, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta, + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$pasta, "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$pasta ), synchronicity = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity, + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$synchronicity, "2016-07-12 
16:06:32-2016-07-12 16:06:33" = data$synchronicity ) ) @@ -997,8 +989,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 12:05:44-2016-07-12 15:58:40", "2010-07-12 12:05:45-2016-07-12 15:58:50", "2016-07-12 15:58:40-2016-07-12 16:05:37", - "2016-07-12 15:58:50-2016-07-12 16:05:38", - "2016-07-12 16:05:37-2016-07-12 16:05:38" + "2016-07-12 15:58:50-2016-07-12 16:05:38" ) lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") @@ -1017,8 +1008,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.revisions = c("2004-10-09 18:38:13", "2005-02-09 18:49:49", "2010-07-12 11:05:35", "2010-07-12 12:05:34", "2010-07-12 12:05:41", "2010-07-12 12:05:42", "2010-07-12 12:05:44", "2010-07-12 12:05:45", "2016-07-12 15:58:40", - "2016-07-12 15:58:50", "2016-07-12 16:05:37", "2016-07-12 16:05:38", - "2016-07-12 16:05:38"), + "2016-07-12 15:58:50", "2016-07-12 16:05:37", "2016-07-12 16:05:38"), split.revision.dates = NULL ) lapply(results, function(res) { @@ -1039,8 +1029,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$commits[0, ], "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$commits[0, ], "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$commits[1:2, ], - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$commits[1:2, ], - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$commits[0, ] + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$commits[1:2, ] ), commit.messages = list( "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$commit.messages, @@ -1052,8 +1041,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$commit.messages, "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$commit.messages, "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$commit.messages, - 
"2016-07-12 15:58:50-2016-07-12 16:05:38" = data$commit.messages, - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$commit.messages + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$commit.messages ), issues = list( "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$issues[0, ], @@ -1065,8 +1053,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:15, 31:32, 48:49), ], "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$issues[rownames(data$issues) %in% c(1:15, 31:32, 48:49), ], "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:45, 50:51, 53:55), ], - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:45, 50:51, 53:55), ], - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$issues[0, ] + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:45, 50:51, 53:55), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -1079,8 +1066,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$mails[10:12, ], # rownames(data$mails) %in% 10:12 "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$mails[11:13, ], # rownames(data$mails) %in% c(11:12, 14) "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$mails[13:15, ], # rownames(data$mails) %in% 14:16 - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$mails[14:16, ], # rownames(data$mails) %in% 15:17 - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$mails[16, ] # rownames(data$mails) %in% 17 + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$mails[14:16, ] # rownames(data$mails) %in% 15:17 ), pasta = list( "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$pasta, @@ -1092,8 +1078,7 @@ patrick::with_parameters_test_that("Split a data object activity-based 
(activity "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$pasta, "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$pasta, "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$pasta, - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$pasta, - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$pasta + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$pasta ), synchronicity = list( "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$synchronicity, @@ -1105,8 +1090,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$synchronicity, "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$synchronicity, "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$synchronicity, - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$synchronicity, - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$synchronicity + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$synchronicity ) ) results.data = list( @@ -1149,7 +1133,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.type = "activity-based", split.length = 26, split.basis = "mails", - split.sliding.window = TRUE, + split.sliding.window = FALSE, split.revisions = c("2004-10-09 18:38:13", "2016-07-12 16:05:38"), split.revision.dates = NULL ) @@ -1237,8 +1221,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:04:59-2016-08-07 15:30:00", "2016-07-15 19:55:39-2016-08-31 16:45:09", "2016-08-07 15:30:00-2017-05-23 12:31:34", - "2016-08-31 16:45:09-2017-05-23 12:32:40", - "2017-05-23 12:31:34-2017-05-23 12:32:40" + "2016-08-31 16:45:09-2017-05-23 12:32:40" ) lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") @@ -1257,8 +1240,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.revisions = c("2013-04-21 23:52:09", "2013-05-06 01:04:34", "2013-05-25 06:22:23", "2013-06-01 06:54:00", 
"2016-07-12 15:59:25", "2016-07-12 16:01:30", "2016-07-12 16:04:59", "2016-07-15 19:55:39", "2016-08-07 15:30:00", - "2016-08-31 16:45:09", "2017-05-23 12:31:34", "2017-05-23 12:32:40", - "2017-05-23 12:32:40"), + "2016-08-31 16:45:09", "2017-05-23 12:31:34", "2017-05-23 12:32:40"), split.revision.dates = NULL ) lapply(results, function(res) { @@ -1279,8 +1261,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:04:59-2016-08-07 15:30:00" = data$commits[3:8, ], "2016-07-15 19:55:39-2016-08-31 16:45:09" = data$commits[0, ], "2016-08-07 15:30:00-2017-05-23 12:31:34" = data$commits[0, ], - "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$commits[0, ], - "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$commits[0, ] + "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$commits[0, ] ), commit.messages = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commit.messages, @@ -1292,8 +1273,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:04:59-2016-08-07 15:30:00" = data$commit.messages, "2016-07-15 19:55:39-2016-08-31 16:45:09" = data$commit.messages, "2016-08-07 15:30:00-2017-05-23 12:31:34" = data$commit.messages, - "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$commit.messages, - "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$commit.messages + "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$commit.messages ), issues = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$issues[rownames(data$issues) %in% 1:10, ], @@ -1305,8 +1285,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:04:59-2016-08-07 15:30:00" = data$issues[rownames(data$issues) %in% c(18:21, 25, 30, 46:47, 50:51), ], "2016-07-15 19:55:39-2016-08-31 16:45:09" = data$issues[rownames(data$issues) %in% c(18:21, 26, 29:30, 34, 41, 52), ], "2016-08-07 15:30:00-2017-05-23 12:31:34" = data$issues[rownames(data$issues) %in% c(26:29, 34:38, 41, 52), ], - 
"2016-08-31 16:45:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(27:28, 35:40), ], - "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(39:40), ] + "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(27:28, 35:40), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -1319,8 +1298,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:04:59-2016-08-07 15:30:00" = data$mails[16, ], # rownames(data$mails) %in% 16 "2016-07-15 19:55:39-2016-08-31 16:45:09" = data$mails[0, ], "2016-08-07 15:30:00-2017-05-23 12:31:34" = data$mails[0, ], - "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$mails[0, ], - "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$mails[0, ] + "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$mails[0, ] ), pasta = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$pasta, @@ -1332,8 +1310,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:04:59-2016-08-07 15:30:00" = data$pasta, "2016-07-15 19:55:39-2016-08-31 16:45:09" = data$pasta, "2016-08-07 15:30:00-2017-05-23 12:31:34" = data$pasta, - "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$pasta, - "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$pasta + "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$pasta ), synchronicity = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$synchronicity, @@ -1345,8 +1322,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:04:59-2016-08-07 15:30:00" = data$synchronicity, "2016-07-15 19:55:39-2016-08-31 16:45:09" = data$synchronicity, "2016-08-07 15:30:00-2017-05-23 12:31:34" = data$synchronicity, - "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$synchronicity, - "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$synchronicity + "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$synchronicity ) ) results.data 
= list( @@ -1389,7 +1365,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.type = "activity-based", split.length = 65, split.basis = "issues", - split.sliding.window = TRUE, + split.sliding.window = FALSE, split.revisions = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), split.revision.dates = NULL ) diff --git a/util-split.R b/util-split.R index 36e27c97..84970d73 100644 --- a/util-split.R +++ b/util-split.R @@ -71,16 +71,18 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = #' Split project data in activity-bin-based ranges as specified #' -#' @param project.data the *Data object from which the data is retrieved +#' @param project.data the project data object from which the data is retrieved #' @param activity.amount the amount of data elements with unique ids to be considered in a bin, an integer. #' @param bins the date objects defining the start of ranges (the last date defines the end of the last range, in an -#' *exclusive* manner), augmented with a bin vector mapping unique ids to bins. -#' [default: NULL] +#' *exclusive* manner), including a vector which maps elements of the \code{split.basis} column of +#' \code{project.data} to bins, as produced by \code{split.get.bins.activity.based}. #' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues' #' [default: "commits"] #' @param sliding.window logical indicating whether a sliding-window approach was used when obtaining the \code{bins}. #' #' @return the list of RangeData objects, each referring to one bin +#' +#' @seealso split.get.bins.activity.based split.data.by.bins.vector = function(project.data, activity.amount, bins, split.basis = c("commits", "mails", "issues"), sliding.window) { split = split.data.by.time.or.bins(project.data, activity.amount, bins, split.by.time = FALSE, @@ -92,13 +94,13 @@ split.data.by.bins.vector = function(project.data, activity.amount, bins, split. 
#' #' @param project.data the *Data object from which the data is retrieved #' @param splitting.length either \code{time.period} from \code{split.data.time.based} -#' or \code{splitting.length} from\code{split.data.by.bins.vector} -#' @param bins either \code{bins} from \code{split.data.time.based} -#' or \code{bins} from\code{split.data.by.bins.vector} -#' @param split.by.time logical indicating whether splitting is done time-based or by activity-bins-based, +#' or \code{activity.amount} from\code{split.data.by.bins.vector} +#' @param bins either formatted as the \code{bins} parameter of \code{split.data.time.based} +#' or as the \code{bins} parameter of \code{split.data.by.bins.vector} +#' @param split.by.time logical indicating whether splitting is done time-based or activity-bins-based #' @param number.windows see \code{number.windows} from \code{split.data.time.by.bins.vector} #' [default: NULL] -#' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues' +#' @param split.basis the data source to use as the basis for split bins, either 'commits', 'mails', or 'issues' #' [default: "commits"] #' @param sliding.window logical indicating whether the splitting should be performed using a sliding-window approach #' [default: FALSE] @@ -107,6 +109,9 @@ split.data.by.bins.vector = function(project.data, activity.amount, bins, split. 
#' [default: NULL] #' #' @return the list of RangeData objects, each referring to one time period +#' +#' @seealso split.data.time.based +#' @seealso split.data.by.bins.vector split.data.by.time.or.bins = function(project.data, splitting.length, bins, split.by.time, number.windows = NULL, split.basis = c("commits", "mails", "issues"), sliding.window = FALSE, project.conf.new = NULL) { @@ -147,8 +152,8 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli sliding.window = FALSE } - ## initiate variable - split.by.bins = FALSE + ## indicates if time-based splitting is performed using bins + split.time.based.with.bins = FALSE ## if bins are NOT given explicitly if (is.null(bins)) { @@ -162,12 +167,15 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli ## when bins are given explicitly, get bins based on parameter else { if (split.by.time) { + split.time.based.with.bins = TRUE split.basis = NULL - split.by.bins = TRUE - sliding.window = FALSE bins = get.date.from.string(bins) bins = get.date.string(bins) + ## remove sliding windows + sliding.window = FALSE } else { + ## sliding windows do not need to be removed here, as sliding windows and bins + ## are not contradicting in activity-based splitting bins.vector = bins[["vector"]] bins = bins[["bins"]] } @@ -182,7 +190,7 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli bins.ranges = construct.ranges(bins) names(bins.ranges) = bins.ranges - if (split.by.time && (length(bins.ranges) <= 1) && sliding.window) { + if ((length(bins.ranges) <= 1) && sliding.window) { logging::logwarn("Sliding-window approach does not apply for one range or less.") sliding.window = FALSE } @@ -245,7 +253,7 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli }) } else { - ## perform different steps for sliding-window approach + ## perform different steps for sliding-window approach of time-based splitting ranges = 
construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), time.period = splitting.length, overlap = 0.5, raw = FALSE, @@ -271,7 +279,7 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli ## add splitting information to project configuration project.conf.new$set.splitting.info( type = if (split.by.time) "time-based" else "activity-based", - length = if (split.by.bins) { + length = if (split.time.based.with.bins) { bins } else { @@ -416,7 +424,9 @@ split.data.activity.based = function(project.data, activity.type = c("commits", logging::loginfo("Splitting data '%s' into activity ranges of %s %s (%s windows).", project.data$get.class.name(), activity.amount, activity.type, number.windows) - ## get bins based on split.basis + ## get bins based on split.basis. Here the include.duplicate.ids parameter flag must be set, to + ## retrieve bins which map every event to a bin including events with non-unique ids. This is important + ## to ensure that every range really has activity.amount many entries after splitting logging::logdebug("Getting activity-based bins.") bins.data = split.get.bins.activity.based(data[[activity.type]], id.column[[activity.type]], activity.amount, remove.duplicate.bins = TRUE, include.duplicate.ids = TRUE) @@ -425,7 +435,7 @@ split.data.activity.based = function(project.data, activity.type = c("commits", ## split the data based on the extracted timestamps logging::logdebug("Splitting data based on time windows arising from activity bins.") - cf.data = split.data.by.bins.vector(project.data, bins = bins.data, activity.amount = activity.amount, + cf.data = split.data.by.bins.vector(project.data, bins = bins.data, activity.amount = activity.amount, sliding.window = sliding.window, split.basis = activity.type) ## perform additional steps for sliding-window approach: @@ -440,30 +450,9 @@ split.data.activity.based = function(project.data, activity.type = c("commits", items.unique = 
unique(data[[activity.type]][[ id.column[[activity.type]] ]]) items.unique.count = length(items.unique) - ## offsets used for cropping (half the first/last bin) + ## offsets used for cropping (half of the first bin) offset.start = floor(activity.amount / 2) - offset.end = (items.unique.count - offset.start) %% activity.amount - - # make sure that end offset does not go above one window - last.window = cf.data[[length(cf.data)]][[DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[activity.type]]]]() - length.of.last.window = length(unique(last.window[[ id.column[[activity.type]] ]])) - - offset.end = max(c(length.of.last.window - offset.start, 0)) - - ## cut the data appropriately - if (offset.end > 0) { - items.cut = c( - items.unique[seq_len(offset.start)], - items.unique[seq(from = (items.unique.count - offset.end + 1), to = items.unique.count)] - ) - } else { - items.cut = items.unique[seq_len(offset.start)] - } - - ## determine end bin of last sliding-window range - end.event.id = items.unique[(items.unique.count - offset.end + 1)] - end.event.logical = (data[[activity.type]][[ id.column[[activity.type]] ]] == end.event.id) - end.event.date = unique(data[[activity.type]][end.event.logical, ][["date"]]) + items.cut = items.unique[seq_len(offset.start)] ## store the data again data.to.cut = data[[activity.type]][[ id.column[[activity.type]] ]] %in% items.cut @@ -480,12 +469,34 @@ split.data.activity.based = function(project.data, activity.type = c("commits", activity.amount = activity.amount, sliding.window = FALSE, project.conf.new = project.conf.new) + ## extract bins + bins.date.middle = attr(cf.data.sliding, "bins") + + ## Both the last sliding range and the last regular range end at the very last item. + ## This is the case because the end of the data is never cropped (like the beginning is). + ## split.data.activity.based, which is invoked to obtain both sets of ranges, creates + ## ranges until all elements are in one. 
+ ## + ## The conditional below inspects whether the very last item is in the first or the second + ## half of the last regular range. If it is in the first half, there will be a sliding + ## window which covers all items of the last regular range which makes the last regular + ## range obsolete. + ## Similarly, if the last item is in the second half of the last regular range, there + ## will be a sliding range (which started at the half of the last regular range) which + ## contains only items also included in the last regular range, which makes the sliding + ## range obsolete. + if (((items.unique.count - 1) %% (activity.amount)) >= (offset.start)) { + cf.data.sliding = cf.data.sliding[-length(cf.data.sliding)] + bins.date.middle = bins.date.middle[-length(bins.date.middle)] + } else { + cf.data = cf.data[-length(cf.data)] + bins.date = bins.date[-length(bins.date)] + bins = bins[-length(bins)] + } + ## append data to normally-split data cf.data = append(cf.data, cf.data.sliding) - ## compute bins for sliding windows: pairwise middle between dates - bins.date.middle = attr(cf.data.sliding, "bins") - ## sort data object properly by bin starts bins.ranges.start = c(head(bins.date, -1), head(bins.date.middle, -1)) cf.data = cf.data[ order(bins.ranges.start) ] @@ -494,38 +505,6 @@ split.data.activity.based = function(project.data, activity.type = c("commits", bins.date = sort(c(bins.date, bins.date.middle)) bins = get.date.string(bins.date) - ## if the last regular range and the last sliding-window range end at the same time - ## and the data of the last regular range is contained in the last sliding-window range, then: - ## remove the last regular range as it is not complete and we don't loose data when removing it - last.regular.range = cf.data[[length(cf.data)]] - last.sliding.range = cf.data.sliding[[length(cf.data.sliding) - 1]] - get.activity.data = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[activity.type]] - - last.regular.range.ids = 
(last.regular.range[[get.activity.data]]())[[ id.column[[activity.type]] ]] - last.sliding.range.ids = (last.sliding.range[[get.activity.data]]())[[ id.column[[activity.type]] ]] - if (bins.date[length(bins.date)] == bins.date.middle[length(bins.date.middle)] - && all(last.regular.range.ids %in% last.sliding.range.ids) ) { - - cf.data = cf.data[-length(cf.data)] - bins.date = bins.date[-length(bins.date)] - bins = bins[-length(bins)] - } else if (bins.date[length(bins.date)] != bins.date.middle[length(bins.date.middle)]) { - ## adjust the end date of the last sliding-window range, as it might be shorter than it should be: - ## The end of the last range usually is one second after the last event (as end dates are exclusive). - ## In case of sliding windows, the end of the last sliding range needs to be extended to the date of the - ## next event after that range (as end dates are exclusive) to get a full range as for all the previous - ## ranges which end at the beginning of the next range, which is the date of the first event after the - ## actual range. - - ## When we have sliding windows, there are, at least, three ranges (two regular ranges and one - ## sliding-window range. Hence, there are always more than three elements in the bins vector, so accessing - ## bins[length(bins) - 3] cannot throw errors in this case. 
- name.last.sliding.window = construct.ranges(c(bins[length(bins) - 3], get.date.string(end.event.date))) - names(cf.data)[length(cf.data) - 1] = name.last.sliding.window - bins.date[length(bins.date) - 1] = end.event.date - bins[length(bins) - 1] = get.date.string(end.event.date) - } - ## update project configuration project.conf.new$set.revisions(bins, bins.date, sliding.window = TRUE) for (cf in cf.data) { @@ -1171,7 +1150,7 @@ split.get.bins.time.based = function(dates, time.period, number.windows = NULL) #' [default: FALSE] #' #' @return a list, -#' the item 'vector': the bins each row in 'df' belongs to (increasing integers),q +#' the item 'vector': the bins each row in 'df' belongs to (increasing integers), #' the item 'bins': the bin labels, described by dates, each bin containing #' 'activity.amount' many unique items; each item in the vector indicates #' the start of a bin, although the last item indicates the end of the last bin