From 48ef4fa685adf6e5d85281e5b90a8ed8f6aeb197 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Wed, 18 Oct 2023 15:38:17 +0200 Subject: [PATCH] Rework sliding window approach of 'split.data.activity.based' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rework the algorithm that creates sliding windows in activity-based splitting. Instead of cutting off half a range's worth of elements at the end before building the sliding windows (which creates a lot of edge cases), build sliding windows with every element up to the last one. Then remove the last incomplete range. The contents of the last incomplete range are fully included in the second-to-last range and are therefore redundant. Sometimes the last incomplete range is a regular range. Previously, the last range always had to be a regular range. This means that removing the last incomplete range requires updating the tests. Additionally, fix and improve the documentation of the splitting methods and fix minor spelling mistakes. This works towards #239. 
Signed-off-by: Maximilian Löffler --- tests/test-split-data-activity-based.R | 94 +++++++----------- util-split.R | 131 +++++++++++-------------- 2 files changed, 90 insertions(+), 135 deletions(-) diff --git a/tests/test-split-data-activity-based.R b/tests/test-split-data-activity-based.R index f190adef..9fa282ee 100644 --- a/tests/test-split-data-activity-based.R +++ b/tests/test-split-data-activity-based.R @@ -663,8 +663,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10", "2016-07-12 16:00:45-2016-07-12 16:06:20", "2016-07-12 16:06:10-2016-07-12 16:06:32", - "2016-07-12 16:06:20-2016-07-12 16:06:33", - "2016-07-12 16:06:32-2016-07-12 16:06:33" + "2016-07-12 16:06:20-2016-07-12 16:06:33" ) lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, @@ -682,8 +681,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.basis = "commits", split.sliding.window = TRUE, split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:06:10", - "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:33", - "2016-07-12 16:06:33"), + "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:33"), split.revision.dates = NULL ) lapply(results, function(res) { @@ -698,44 +696,38 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:3, ], "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commits[2:4, ], "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ], - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commits[7:8, ] + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ] ), commit.messages = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commit.messages, "2016-07-12 16:00:45-2016-07-12 16:06:20" = 
data$commit.messages, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commit.messages, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages, - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commit.messages + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages ), issues = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:46, 50:51, 53:55), ], "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(16:17, 33, 45:46, 50:51, 53:55), ], "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 25, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 25, ], - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 25, ] ), mails = list( ## comments indicate row names when pasta is not configured "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ], - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$mails[0, ] + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$pasta, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta, - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$pasta + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta ), synchronicity = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, - "2016-07-12 
16:06:20-2016-07-12 16:06:33" = data$synchronicity, - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$synchronicity + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity ) ) results.data = list( @@ -778,7 +770,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.type = "activity-based", split.length = 18, split.basis = "commits", - split.sliding.window = TRUE, + split.sliding.window = FALSE, split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:33"), split.revision.dates = NULL ) @@ -864,7 +856,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10", "2016-07-12 16:00:45-2016-07-12 16:06:20", "2016-07-12 16:06:10-2016-07-12 16:06:32", - "2016-07-12 16:06:20-2016-07-12 16:06:33", + "2016-07-12 16:06:20-2016-07-12 16:06:32", "2016-07-12 16:06:32-2016-07-12 16:06:33" ) lapply(results, function(res) { @@ -883,7 +875,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.basis = "commits", split.sliding.window = TRUE, split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:06:10", - "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:33", + "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:32", "2016-07-12 16:06:33"), split.revision.dates = NULL ) @@ -899,21 +891,21 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:3, ], "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commits[2:4, ], "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ], + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$commits[5:8, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commits[7:9, ] ), commit.messages = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commit.messages, "2016-07-12 16:00:45-2016-07-12 
16:06:20" = data$commit.messages, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commit.messages, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages, + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$commit.messages, "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commit.messages ), issues = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:46, 50:51, 53:55), ], "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(16:17, 33, 45:46, 50:51, 53:55), ], "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 25, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 25, ], + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 25, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] ), mails = list( @@ -921,21 +913,21 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ], + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$mails[0, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$mails[0, ] ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$pasta, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta, + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$pasta, "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$pasta ), synchronicity = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, "2016-07-12 
16:06:10-2016-07-12 16:06:32" = data$synchronicity, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity, + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$synchronicity, "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$synchronicity ) ) @@ -997,8 +989,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 12:05:44-2016-07-12 15:58:40", "2010-07-12 12:05:45-2016-07-12 15:58:50", "2016-07-12 15:58:40-2016-07-12 16:05:37", - "2016-07-12 15:58:50-2016-07-12 16:05:38", - "2016-07-12 16:05:37-2016-07-12 16:05:38" + "2016-07-12 15:58:50-2016-07-12 16:05:38" ) lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") @@ -1017,8 +1008,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.revisions = c("2004-10-09 18:38:13", "2005-02-09 18:49:49", "2010-07-12 11:05:35", "2010-07-12 12:05:34", "2010-07-12 12:05:41", "2010-07-12 12:05:42", "2010-07-12 12:05:44", "2010-07-12 12:05:45", "2016-07-12 15:58:40", - "2016-07-12 15:58:50", "2016-07-12 16:05:37", "2016-07-12 16:05:38", - "2016-07-12 16:05:38"), + "2016-07-12 15:58:50", "2016-07-12 16:05:37", "2016-07-12 16:05:38"), split.revision.dates = NULL ) lapply(results, function(res) { @@ -1039,8 +1029,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$commits[0, ], "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$commits[0, ], "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$commits[1:2, ], - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$commits[1:2, ], - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$commits[0, ] + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$commits[1:2, ] ), commit.messages = list( "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$commit.messages, @@ -1052,8 +1041,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 
12:05:44-2016-07-12 15:58:40" = data$commit.messages, "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$commit.messages, "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$commit.messages, - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$commit.messages, - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$commit.messages + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$commit.messages ), issues = list( "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$issues[0, ], @@ -1065,8 +1053,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:15, 31:32, 48:49), ], "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$issues[rownames(data$issues) %in% c(1:15, 31:32, 48:49), ], "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:45, 50:51, 53:55), ], - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:45, 50:51, 53:55), ], - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$issues[0, ] + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:45, 50:51, 53:55), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -1079,8 +1066,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$mails[10:12, ], # rownames(data$mails) %in% 10:12 "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$mails[11:13, ], # rownames(data$mails) %in% c(11:12, 14) "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$mails[13:15, ], # rownames(data$mails) %in% 14:16 - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$mails[14:16, ], # rownames(data$mails) %in% 15:17 - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$mails[16, ] # rownames(data$mails) %in% 17 + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$mails[14:16, ] # 
rownames(data$mails) %in% 15:17 ), pasta = list( "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$pasta, @@ -1092,8 +1078,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$pasta, "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$pasta, "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$pasta, - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$pasta, - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$pasta + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$pasta ), synchronicity = list( "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$synchronicity, @@ -1105,8 +1090,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$synchronicity, "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$synchronicity, "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$synchronicity, - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$synchronicity, - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$synchronicity + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$synchronicity ) ) results.data = list( @@ -1149,7 +1133,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.type = "activity-based", split.length = 26, split.basis = "mails", - split.sliding.window = TRUE, + split.sliding.window = FALSE, split.revisions = c("2004-10-09 18:38:13", "2016-07-12 16:05:38"), split.revision.dates = NULL ) @@ -1237,8 +1221,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:04:59-2016-08-07 15:30:00", "2016-07-15 19:55:39-2016-08-31 16:45:09", "2016-08-07 15:30:00-2017-05-23 12:31:34", - "2016-08-31 16:45:09-2017-05-23 12:32:40", - "2017-05-23 12:31:34-2017-05-23 12:32:40" + "2016-08-31 16:45:09-2017-05-23 12:32:40" ) lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") @@ -1257,8 +1240,7 @@ 
patrick::with_parameters_test_that("Split a data object activity-based (activity split.revisions = c("2013-04-21 23:52:09", "2013-05-06 01:04:34", "2013-05-25 06:22:23", "2013-06-01 06:54:00", "2016-07-12 15:59:25", "2016-07-12 16:01:30", "2016-07-12 16:04:59", "2016-07-15 19:55:39", "2016-08-07 15:30:00", - "2016-08-31 16:45:09", "2017-05-23 12:31:34", "2017-05-23 12:32:40", - "2017-05-23 12:32:40"), + "2016-08-31 16:45:09", "2017-05-23 12:31:34", "2017-05-23 12:32:40"), split.revision.dates = NULL ) lapply(results, function(res) { @@ -1279,8 +1261,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:04:59-2016-08-07 15:30:00" = data$commits[3:8, ], "2016-07-15 19:55:39-2016-08-31 16:45:09" = data$commits[0, ], "2016-08-07 15:30:00-2017-05-23 12:31:34" = data$commits[0, ], - "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$commits[0, ], - "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$commits[0, ] + "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$commits[0, ] ), commit.messages = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commit.messages, @@ -1292,8 +1273,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:04:59-2016-08-07 15:30:00" = data$commit.messages, "2016-07-15 19:55:39-2016-08-31 16:45:09" = data$commit.messages, "2016-08-07 15:30:00-2017-05-23 12:31:34" = data$commit.messages, - "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$commit.messages, - "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$commit.messages + "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$commit.messages ), issues = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$issues[rownames(data$issues) %in% 1:10, ], @@ -1305,8 +1285,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:04:59-2016-08-07 15:30:00" = data$issues[rownames(data$issues) %in% c(18:21, 25, 30, 46:47, 50:51), ], "2016-07-15 19:55:39-2016-08-31 16:45:09" = 
data$issues[rownames(data$issues) %in% c(18:21, 26, 29:30, 34, 41, 52), ], "2016-08-07 15:30:00-2017-05-23 12:31:34" = data$issues[rownames(data$issues) %in% c(26:29, 34:38, 41, 52), ], - "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(27:28, 35:40), ], - "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(39:40), ] + "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(27:28, 35:40), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -1319,8 +1298,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:04:59-2016-08-07 15:30:00" = data$mails[16, ], # rownames(data$mails) %in% 16 "2016-07-15 19:55:39-2016-08-31 16:45:09" = data$mails[0, ], "2016-08-07 15:30:00-2017-05-23 12:31:34" = data$mails[0, ], - "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$mails[0, ], - "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$mails[0, ] + "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$mails[0, ] ), pasta = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$pasta, @@ -1332,8 +1310,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:04:59-2016-08-07 15:30:00" = data$pasta, "2016-07-15 19:55:39-2016-08-31 16:45:09" = data$pasta, "2016-08-07 15:30:00-2017-05-23 12:31:34" = data$pasta, - "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$pasta, - "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$pasta + "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$pasta ), synchronicity = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$synchronicity, @@ -1345,8 +1322,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:04:59-2016-08-07 15:30:00" = data$synchronicity, "2016-07-15 19:55:39-2016-08-31 16:45:09" = data$synchronicity, "2016-08-07 15:30:00-2017-05-23 12:31:34" = data$synchronicity, - "2016-08-31 
16:45:09-2017-05-23 12:32:40" = data$synchronicity, - "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$synchronicity + "2016-08-31 16:45:09-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( @@ -1389,7 +1365,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.type = "activity-based", split.length = 65, split.basis = "issues", - split.sliding.window = TRUE, + split.sliding.window = FALSE, split.revisions = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), split.revision.dates = NULL ) diff --git a/util-split.R b/util-split.R index 36e27c97..84970d73 100644 --- a/util-split.R +++ b/util-split.R @@ -71,16 +71,18 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = #' Split project data in activity-bin-based ranges as specified #' -#' @param project.data the *Data object from which the data is retrieved +#' @param project.data the project data object from which the data is retrieved #' @param activity.amount the amount of data elements with unique ids to be considered in a bin, an integer. #' @param bins the date objects defining the start of ranges (the last date defines the end of the last range, in an -#' *exclusive* manner), augmented with a bin vector mapping unique ids to bins. -#' [default: NULL] +#' *exclusive* manner), including a vector which maps elements of the \code{split.basis} column of +#' \code{project.data} to bins, as produced by \code{split.get.bins.activity.based}. #' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues' #' [default: "commits"] #' @param sliding.window logical indicating whether a sliding-window approach was used when obtaining the \code{bins}. 
#' #' @return the list of RangeData objects, each referring to one bin +#' +#' @seealso split.get.bins.activity.based split.data.by.bins.vector = function(project.data, activity.amount, bins, split.basis = c("commits", "mails", "issues"), sliding.window) { split = split.data.by.time.or.bins(project.data, activity.amount, bins, split.by.time = FALSE, @@ -92,13 +94,13 @@ split.data.by.bins.vector = function(project.data, activity.amount, bins, split. #' #' @param project.data the *Data object from which the data is retrieved #' @param splitting.length either \code{time.period} from \code{split.data.time.based} -#' or \code{splitting.length} from\code{split.data.by.bins.vector} -#' @param bins either \code{bins} from \code{split.data.time.based} -#' or \code{bins} from\code{split.data.by.bins.vector} -#' @param split.by.time logical indicating whether splitting is done time-based or by activity-bins-based, +#' or \code{activity.amount} from\code{split.data.by.bins.vector} +#' @param bins either formatted as the \code{bins} parameter of \code{split.data.time.based} +#' or as the \code{bins} parameter of \code{split.data.by.bins.vector} +#' @param split.by.time logical indicating whether splitting is done time-based or activity-bins-based #' @param number.windows see \code{number.windows} from \code{split.data.time.by.bins.vector} #' [default: NULL] -#' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues' +#' @param split.basis the data source to use as the basis for split bins, either 'commits', 'mails', or 'issues' #' [default: "commits"] #' @param sliding.window logical indicating whether the splitting should be performed using a sliding-window approach #' [default: FALSE] @@ -107,6 +109,9 @@ split.data.by.bins.vector = function(project.data, activity.amount, bins, split. 
#' [default: NULL] #' #' @return the list of RangeData objects, each referring to one time period +#' +#' @seealso split.data.time.based +#' @seealso split.data.by.bins.vector split.data.by.time.or.bins = function(project.data, splitting.length, bins, split.by.time, number.windows = NULL, split.basis = c("commits", "mails", "issues"), sliding.window = FALSE, project.conf.new = NULL) { @@ -147,8 +152,8 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli sliding.window = FALSE } - ## initiate variable - split.by.bins = FALSE + ## indicates if time-based splitting is performed using bins + split.time.based.with.bins = FALSE ## if bins are NOT given explicitly if (is.null(bins)) { @@ -162,12 +167,15 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli ## when bins are given explicitly, get bins based on parameter else { if (split.by.time) { + split.time.based.with.bins = TRUE split.basis = NULL - split.by.bins = TRUE - sliding.window = FALSE bins = get.date.from.string(bins) bins = get.date.string(bins) + ## remove sliding windows + sliding.window = FALSE } else { + ## sliding windows do not need to be removed here, as sliding windows and bins + ## are not contradicting in activity-based splitting bins.vector = bins[["vector"]] bins = bins[["bins"]] } @@ -182,7 +190,7 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli bins.ranges = construct.ranges(bins) names(bins.ranges) = bins.ranges - if (split.by.time && (length(bins.ranges) <= 1) && sliding.window) { + if ((length(bins.ranges) <= 1) && sliding.window) { logging::logwarn("Sliding-window approach does not apply for one range or less.") sliding.window = FALSE } @@ -245,7 +253,7 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli }) } else { - ## perform different steps for sliding-window approach + ## perform different steps for sliding-window approach of time-based splitting ranges = 
construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), time.period = splitting.length, overlap = 0.5, raw = FALSE, @@ -271,7 +279,7 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli ## add splitting information to project configuration project.conf.new$set.splitting.info( type = if (split.by.time) "time-based" else "activity-based", - length = if (split.by.bins) { + length = if (split.time.based.with.bins) { bins } else { @@ -416,7 +424,9 @@ split.data.activity.based = function(project.data, activity.type = c("commits", logging::loginfo("Splitting data '%s' into activity ranges of %s %s (%s windows).", project.data$get.class.name(), activity.amount, activity.type, number.windows) - ## get bins based on split.basis + ## get bins based on split.basis. Here the include.duplicate.ids parameter flag must be set, to + ## retrieve bins which map every event to a bin including events with non-unique ids. This is important + ## to ensure that every range really has activity.amount many entries after splitting logging::logdebug("Getting activity-based bins.") bins.data = split.get.bins.activity.based(data[[activity.type]], id.column[[activity.type]], activity.amount, remove.duplicate.bins = TRUE, include.duplicate.ids = TRUE) @@ -425,7 +435,7 @@ split.data.activity.based = function(project.data, activity.type = c("commits", ## split the data based on the extracted timestamps logging::logdebug("Splitting data based on time windows arising from activity bins.") - cf.data = split.data.by.bins.vector(project.data, bins = bins.data, activity.amount = activity.amount, + cf.data = split.data.by.bins.vector(project.data, bins = bins.data, activity.amount = activity.amount, sliding.window = sliding.window, split.basis = activity.type) ## perform additional steps for sliding-window approach: @@ -440,30 +450,9 @@ split.data.activity.based = function(project.data, activity.type = c("commits", items.unique = 
unique(data[[activity.type]][[ id.column[[activity.type]] ]]) items.unique.count = length(items.unique) - ## offsets used for cropping (half the first/last bin) + ## offsets used for cropping (half of the first bin) offset.start = floor(activity.amount / 2) - offset.end = (items.unique.count - offset.start) %% activity.amount - - # make sure that end offset does not go above one window - last.window = cf.data[[length(cf.data)]][[DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[activity.type]]]]() - length.of.last.window = length(unique(last.window[[ id.column[[activity.type]] ]])) - - offset.end = max(c(length.of.last.window - offset.start, 0)) - - ## cut the data appropriately - if (offset.end > 0) { - items.cut = c( - items.unique[seq_len(offset.start)], - items.unique[seq(from = (items.unique.count - offset.end + 1), to = items.unique.count)] - ) - } else { - items.cut = items.unique[seq_len(offset.start)] - } - - ## determine end bin of last sliding-window range - end.event.id = items.unique[(items.unique.count - offset.end + 1)] - end.event.logical = (data[[activity.type]][[ id.column[[activity.type]] ]] == end.event.id) - end.event.date = unique(data[[activity.type]][end.event.logical, ][["date"]]) + items.cut = items.unique[seq_len(offset.start)] ## store the data again data.to.cut = data[[activity.type]][[ id.column[[activity.type]] ]] %in% items.cut @@ -480,12 +469,34 @@ split.data.activity.based = function(project.data, activity.type = c("commits", activity.amount = activity.amount, sliding.window = FALSE, project.conf.new = project.conf.new) + ## extract bins + bins.date.middle = attr(cf.data.sliding, "bins") + + ## Both, the last sliding range and the last regular range end at the very last item. + ## This is the case because the end of the data is never cropped (like the beginning is). + ## split.data.activity.based, which is invoked to obtain both set of ranges, creates + ## ranges until all elements are in one. 
+ ## + ## The conditional below inspects whether the very last item is in the first or the second + ## half of the last regular range. If it is in the first half, there will be a sliding + ## window which covers all items of the last regular range which makes the last regular + ## range obsolete. + ## Similarly, if the last item is in the second half of the last regular range, there + ## will be a sliding range (which started at the half of the last regular range) which + ## contains only items also included in the last regular range, which makes the sliding + ## range obsolete. + if (((items.unique.count - 1) %% (activity.amount)) >= (offset.start)) { + cf.data.sliding = cf.data.sliding[-length(cf.data.sliding)] + bins.date.middle = bins.date.middle[-length(bins.date.middle)] + } else { + cf.data = cf.data[-length(cf.data)] + bins.date = bins.date[-length(bins.date)] + bins = bins[-length(bins)] + } + ## append data to normally-split data cf.data = append(cf.data, cf.data.sliding) - ## compute bins for sliding windows: pairwise middle between dates - bins.date.middle = attr(cf.data.sliding, "bins") - ## sort data object properly by bin starts bins.ranges.start = c(head(bins.date, -1), head(bins.date.middle, -1)) cf.data = cf.data[ order(bins.ranges.start) ] @@ -494,38 +505,6 @@ split.data.activity.based = function(project.data, activity.type = c("commits", bins.date = sort(c(bins.date, bins.date.middle)) bins = get.date.string(bins.date) - ## if the last regular range and the last sliding-window range end at the same time - ## and the data of the last regular range is contained in the last sliding-window range, then: - ## remove the last regular range as it is not complete and we don't loose data when removing it - last.regular.range = cf.data[[length(cf.data)]] - last.sliding.range = cf.data.sliding[[length(cf.data.sliding) - 1]] - get.activity.data = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[activity.type]] - - last.regular.range.ids = 
(last.regular.range[[get.activity.data]]())[[ id.column[[activity.type]] ]] - last.sliding.range.ids = (last.sliding.range[[get.activity.data]]())[[ id.column[[activity.type]] ]] - if (bins.date[length(bins.date)] == bins.date.middle[length(bins.date.middle)] - && all(last.regular.range.ids %in% last.sliding.range.ids) ) { - - cf.data = cf.data[-length(cf.data)] - bins.date = bins.date[-length(bins.date)] - bins = bins[-length(bins)] - } else if (bins.date[length(bins.date)] != bins.date.middle[length(bins.date.middle)]) { - ## adjust the end date of the last sliding-window range, as it might be shorter than it should be: - ## The end of the last range usually is one second after the last event (as end dates are exclusive). - ## In case of sliding windows, the end of the last sliding range needs to be extended to the date of the - ## next event after that range (as end dates are exclusive) to get a full range as for all the previous - ## ranges which end at the beginning of the next range, which is the date of the first event after the - ## actual range. - - ## When we have sliding windows, there are, at least, three ranges (two regular ranges and one - ## sliding-window range. Hence, there are always more than three elements in the bins vector, so accessing - ## bins[length(bins) - 3] cannot throw errors in this case. 
- name.last.sliding.window = construct.ranges(c(bins[length(bins) - 3], get.date.string(end.event.date))) - names(cf.data)[length(cf.data) - 1] = name.last.sliding.window - bins.date[length(bins.date) - 1] = end.event.date - bins[length(bins) - 1] = get.date.string(end.event.date) - } - ## update project configuration project.conf.new$set.revisions(bins, bins.date, sliding.window = TRUE) for (cf in cf.data) { @@ -1171,7 +1150,7 @@ split.get.bins.time.based = function(dates, time.period, number.windows = NULL) #' [default: FALSE] #' #' @return a list, -#' the item 'vector': the bins each row in 'df' belongs to (increasing integers),q +#' the item 'vector': the bins each row in 'df' belongs to (increasing integers), #' the item 'bins': the bin labels, described by dates, each bin containing #' 'activity.amount' many unique items; each item in the vector indicates #' the start of a bin, although the last item indicates the end of the last bin