diff --git a/NEWS.md b/NEWS.md index b19c21ef..1c5b7f50 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,27 @@ # coronet – Changelog +## unversioned + +### Added + +- Add issue-based artifact-networks (PR #244, 98a93ee721a293410623aafe46890cfba9d81e72, 771bcc8d961d419b53a1e891e9dc536371f1143b) +- Add a new `split.data.by.bins` function (not to be confused with a previously existing function that had the same name and was renamed in this context), which splits data based on given activity-based bins (PR #244, ece569ceaf557bb38cd0cfad437b69b30fe8a698, ed5feb214a123b605c9513262f187cfd72b9e1f4) + +### Changed/Improved + +- Enhance testing data by adding `add_link` and `referenced_by` issue events which connect issues to form edges in issue-based artifact-networks (PR #244, 9f840c040d552e8639aa82c3dd537c189679b348, ea4fe8d3c84f948af6147cf0137e80181ebb7a1e) +- Add input validation for the `bins` parameter in `split.data.time.based` and `split.data.by.bins` (PR #244, ed0a5302ea8c8934d7200b95be7ac1446305af07, 5e5ecbac44d07927b953ae9d4330a616f8224ba7) +- Rename `split.data.by.bins` into `split.dataframe.by.bins` as this is what it does (PR #244, ed5feb214a123b605c9513262f187cfd72b9e1f4) + +### Fixed + +- Reformat `event.info.1` column of issue data according to the format, if the content of the `event.info.1` field references another issue (PR #244, 62ff9d0f31adbefb3381936237dc4ab984e33acb) +- Fix an issue in activity-based splitting where elements close to the border of bins might be assigned to the wrong bin. The issue was caused by the usage of `split.data.time.based` inside `split.data.activity.based` to split data into the previously derived bins, when elements close to bin borders share the same timestamps. 
It is fixed by replacing `split.data.time.based` by `split.data.by.bins` (PR #244, ece569ceaf557bb38cd0cfad437b69b30fe8a698) +- Remove the last range when using a sliding-window approach and the last range's elements are fully contained in the second last range (PR #244, 48ef4fa685adf6e5d85281e5b90a8ed8f6aeb197) +- Rename vertex attribute `IssueEvent` to `Issue` in multi-networks, to be consistent with bipartite-networks (PR #244, 26d7b7e9fd6d33d1c0a8a08f19c5c2e30346a3d9) + + ## 4.3 ### Added diff --git a/tests/codeface-data/results/testing/test_feature/feature/issues-github.list b/tests/codeface-data/results/testing/test_feature/feature/issues-github.list index 5bb9f215..12d2c0a0 100644 --- a/tests/codeface-data/results/testing/test_feature/feature/issues-github.list +++ b/tests/codeface-data/results/testing/test_feature/feature/issues-github.list @@ -5,6 +5,8 @@ 3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"add_link";"Karl";"karl@example.org";"2016-08-07 15:37:02";"930af63a030fb92e48eddff01f53284c3eeba80e";"""commit""" 3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"referenced";"Karl";"karl@example.org";"2016-08-31 16:45:09";"";"""""" 3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"referenced";"Thomas";"thomas@example.org";"2016-10-05 16:45:09";"";"""""" +3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"add_link";"Karl";"karl@example.org";"2016-08-07 15:37:02";"6";"""issue""" +3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 
16:06:30";"[]";"add_link";"Thomas";"thomas@example.org";"2016-08-07 15:30:00";"2";"""issue""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"mentioned";"udo";"udo@example.org";"2016-07-12 15:30:02";"Thomas";"""thomas@example.org""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"subscribed";"udo";"udo@example.org";"2016-07-12 15:30:02";"Thomas";"""thomas@example.org""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"commented";"Thomas";"thomas@example.org";"2016-07-12 16:03:59";"open";"[]" @@ -15,6 +17,7 @@ 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"subscribed";"Björn";"bjoern@example.org";"2016-12-07 15:30:02";"udo";"""udo@example.org""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"labeled";"Olaf";"olaf@example.org";"2017-05-23 12:31:34";"decided";"""""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"commented";"Björn";"bjoern@example.org";"2017-05-23 12:32:39";"open";"[]" +6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"referenced_by";"Karl";"karl@example.org";"2016-08-07 15:37:02";"3";"""issue""" "1";"Example pull request 1";"[""pull request""]";"reopened";"[]";"2016-07-14 13:37:00";"";"[]";"created";"Thomas";"thomas@example.org";"2016-07-12 15:59:25";"open";"[]" "1";"Example pull request 1";"[""pull 
request""]";"reopened";"[]";"2016-07-14 13:37:00";"";"[]";"commented";"Thomas";"thomas@example.org";"2016-07-12 15:59:25";"open";"[]" "1";"Example pull request 1";"[""pull request""]";"reopened";"[]";"2016-07-14 13:37:00";"";"[]";"state_updated";"Thomas";"thomas@example.org";"2016-07-12 15:59:59";"closed";"""open""" @@ -25,6 +28,7 @@ "2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 14:59:25";"2016-07-12 16:04:59";"[]";"commented";"Björn";"bjoern@example.org";"2016-07-12 14:59:25";"open";"[]" "2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 14:59:25";"2016-07-12 16:04:59";"[]";"merged";"Olaf";"olaf@example.org";"2016-07-12 16:04:59";"";"""""" "2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 14:59:25";"2016-07-12 16:04:59";"[]";"state_updated";"Olaf";"olaf@example.org";"2016-07-12 16:04:59";"closed";"""open""" +"2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 14:59:25";"2016-07-12 16:04:59";"[]";"referenced_by";"Thomas";"thomas@example.org";"2016-08-07 15:30:00";"3";"""issue""" "4";"Example pull request 4";"[""pull request"", ""enhancement""]";"open";"[]";"2016-07-12 16:02:02";"";"[]";"commit_added";"Björn";"bjoern@example.org";"2016-07-12 15:58:59";"72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0";"""""" "4";"Example pull request 4";"[""pull request"", ""enhancement""]";"open";"[]";"2016-07-12 16:02:02";"";"[]";"created";"Olaf";"olaf@example.org";"2016-07-12 16:02:02";"open";"[]" "4";"Example pull request 4";"[""pull request"", ""enhancement""]";"open";"[]";"2016-07-12 16:02:02";"";"[]";"commented";"Olaf";"olaf@example.org";"2016-07-12 16:02:02";"open";"[]" diff --git a/tests/codeface-data/results/testing/test_feature/feature/issues-jira.list b/tests/codeface-data/results/testing/test_feature/feature/issues-jira.list index 3740aa58..c39f31ed 100644 --- a/tests/codeface-data/results/testing/test_feature/feature/issues-jira.list +++ 
b/tests/codeface-data/results/testing/test_feature/feature/issues-jira.list @@ -11,6 +11,8 @@ "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"commented";"Olaf";"olaf@example.org";"2013-05-25 06:22:23";"open";"[""unresolved""]" "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"commented";"Olaf";"olaf@example.org";"2013-06-01 06:50:26";"open";"[""unresolved""]" "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"resolution_updated";"Björn";"bjoern@example.org";"2013-06-01 06:53:06";"fixed";"""unresolved""" +"ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"referenced_by";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-332";"""issue""" +"ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"add_link";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-328";"""issue""" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"created";"Björn";"bjoern@example.org";"2016-07-12 16:01:30";"open";"[""unresolved""]" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against 
Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"commented";"Björn";"bjoern@example.org";"2016-07-12 16:02:30";"open";"[""unresolved""]" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"commented";"Björn";"bjoern@example.org";"2016-07-15 19:55:39";"open";"[""unresolved""]" diff --git a/tests/codeface-data/results/testing/test_proximity/proximity/issues-github.list b/tests/codeface-data/results/testing/test_proximity/proximity/issues-github.list index 5bb9f215..12d2c0a0 100644 --- a/tests/codeface-data/results/testing/test_proximity/proximity/issues-github.list +++ b/tests/codeface-data/results/testing/test_proximity/proximity/issues-github.list @@ -5,6 +5,8 @@ 3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"add_link";"Karl";"karl@example.org";"2016-08-07 15:37:02";"930af63a030fb92e48eddff01f53284c3eeba80e";"""commit""" 3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"referenced";"Karl";"karl@example.org";"2016-08-31 16:45:09";"";"""""" 3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"referenced";"Thomas";"thomas@example.org";"2016-10-05 16:45:09";"";"""""" +3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"add_link";"Karl";"karl@example.org";"2016-08-07 15:37:02";"6";"""issue""" +3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 
16:06:30";"[]";"add_link";"Thomas";"thomas@example.org";"2016-08-07 15:30:00";"2";"""issue""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"mentioned";"udo";"udo@example.org";"2016-07-12 15:30:02";"Thomas";"""thomas@example.org""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"subscribed";"udo";"udo@example.org";"2016-07-12 15:30:02";"Thomas";"""thomas@example.org""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"commented";"Thomas";"thomas@example.org";"2016-07-12 16:03:59";"open";"[]" @@ -15,6 +17,7 @@ 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"subscribed";"Björn";"bjoern@example.org";"2016-12-07 15:30:02";"udo";"""udo@example.org""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"labeled";"Olaf";"olaf@example.org";"2017-05-23 12:31:34";"decided";"""""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"commented";"Björn";"bjoern@example.org";"2017-05-23 12:32:39";"open";"[]" +6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"referenced_by";"Karl";"karl@example.org";"2016-08-07 15:37:02";"3";"""issue""" "1";"Example pull request 1";"[""pull request""]";"reopened";"[]";"2016-07-14 13:37:00";"";"[]";"created";"Thomas";"thomas@example.org";"2016-07-12 15:59:25";"open";"[]" "1";"Example pull request 1";"[""pull 
request""]";"reopened";"[]";"2016-07-14 13:37:00";"";"[]";"commented";"Thomas";"thomas@example.org";"2016-07-12 15:59:25";"open";"[]" "1";"Example pull request 1";"[""pull request""]";"reopened";"[]";"2016-07-14 13:37:00";"";"[]";"state_updated";"Thomas";"thomas@example.org";"2016-07-12 15:59:59";"closed";"""open""" @@ -25,6 +28,7 @@ "2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 14:59:25";"2016-07-12 16:04:59";"[]";"commented";"Björn";"bjoern@example.org";"2016-07-12 14:59:25";"open";"[]" "2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 14:59:25";"2016-07-12 16:04:59";"[]";"merged";"Olaf";"olaf@example.org";"2016-07-12 16:04:59";"";"""""" "2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 14:59:25";"2016-07-12 16:04:59";"[]";"state_updated";"Olaf";"olaf@example.org";"2016-07-12 16:04:59";"closed";"""open""" +"2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 14:59:25";"2016-07-12 16:04:59";"[]";"referenced_by";"Thomas";"thomas@example.org";"2016-08-07 15:30:00";"3";"""issue""" "4";"Example pull request 4";"[""pull request"", ""enhancement""]";"open";"[]";"2016-07-12 16:02:02";"";"[]";"commit_added";"Björn";"bjoern@example.org";"2016-07-12 15:58:59";"72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0";"""""" "4";"Example pull request 4";"[""pull request"", ""enhancement""]";"open";"[]";"2016-07-12 16:02:02";"";"[]";"created";"Olaf";"olaf@example.org";"2016-07-12 16:02:02";"open";"[]" "4";"Example pull request 4";"[""pull request"", ""enhancement""]";"open";"[]";"2016-07-12 16:02:02";"";"[]";"commented";"Olaf";"olaf@example.org";"2016-07-12 16:02:02";"open";"[]" diff --git a/tests/codeface-data/results/testing/test_proximity/proximity/issues-jira.list b/tests/codeface-data/results/testing/test_proximity/proximity/issues-jira.list index 3740aa58..c39f31ed 100644 --- a/tests/codeface-data/results/testing/test_proximity/proximity/issues-jira.list +++ 
b/tests/codeface-data/results/testing/test_proximity/proximity/issues-jira.list @@ -11,6 +11,8 @@ "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"commented";"Olaf";"olaf@example.org";"2013-05-25 06:22:23";"open";"[""unresolved""]" "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"commented";"Olaf";"olaf@example.org";"2013-06-01 06:50:26";"open";"[""unresolved""]" "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"resolution_updated";"Björn";"bjoern@example.org";"2013-06-01 06:53:06";"fixed";"""unresolved""" +"ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"referenced_by";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-332";"""issue""" +"ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"add_link";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-328";"""issue""" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"created";"Björn";"bjoern@example.org";"2016-07-12 16:01:30";"open";"[""unresolved""]" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against 
Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"commented";"Björn";"bjoern@example.org";"2016-07-12 16:02:30";"open";"[""unresolved""]" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"commented";"Björn";"bjoern@example.org";"2016-07-15 19:55:39";"open";"[""unresolved""]" diff --git a/tests/test-core-peripheral.R b/tests/test-core-peripheral.R index 07c7389c..c9397d6f 100644 --- a/tests/test-core-peripheral.R +++ b/tests/test-core-peripheral.R @@ -17,6 +17,7 @@ ## Copyright 2022 by Thomas Bock ## Copyright 2019 by Christian Hechtl ## Copyright 2021 by Christian Hechtl +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. @@ -171,8 +172,8 @@ test_that("Issue-count classification" , { result = get.author.class.issue.count(proj.data, issue.type = "all") ## Assert - expected.core = data.frame(author.name = c("Björn", "Olaf", "Thomas"), issue.count = c(6, 6, 4)) - expected.peripheral = data.frame(author.name = c("Karl", "Max", "udo"), issue.count = c(1, 1, 1)) + expected.core = data.frame(author.name = c("Björn", "Olaf", "Thomas"), issue.count = c(6, 6, 6)) + expected.peripheral = data.frame(author.name = c("Karl", "Max", "udo"), issue.count = c(2, 1, 1)) expected = list(core = expected.core, peripheral = expected.peripheral) row.names(result[["core"]]) = NULL diff --git a/tests/test-data.R b/tests/test-data.R index 1d06a34c..9c6f4f8c 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -19,6 +19,7 @@ ## Copyright 2021 by Johannes Hostert ## Copyright 2021 by Mirabdulla Yusifli ## Copyright 2022 by Jonathan Baumann +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. 
@@ -402,8 +403,8 @@ test_that("Filter bots from issue data", { filtered.issues = proj.data$get.issues() expect_true(all(filtered.issues[["author.name"]] != "Thomas")) - ## there are now 41 issue events remaining, since 6 issue events have been removed during filtering - expect_equal(nrow(filtered.issues), 41) + ## there are now 43 issue events remaining, since 10 issue events have been removed during filtering + expect_equal(nrow(filtered.issues), 43) }) test_that("Filter bots from mail data", { diff --git a/tests/test-networks-artifact.R b/tests/test-networks-artifact.R index 8eaebaf8..d4c70ec6 100644 --- a/tests/test-networks-artifact.R +++ b/tests/test-networks-artifact.R @@ -101,3 +101,97 @@ test_that("Network construction of the undirected artifact-cochange network", { ## test expect_true(igraph::identical_graphs(network.built, network.expected)) }) + +patrick::with_parameters_test_that("Network construction of an issue-based artifact-network", { + ## build expected network: + ## 1) vertices + vertices = data.frame(name = c("", + "", + "", + "", + "", + "" , + ""), + kind = "Issue", + type = TYPE.ARTIFACT) + ## 2) edges + edges = data.frame( + from = c("", "", ""), + to = c("", "", ""), + date = get.date.from.string(c("2016-08-07 15:30:00", "2016-08-07 15:37:02", "2017-05-21 12:00:00")), + artifact.type = c("IssueEvent", "IssueEvent", "IssueEvent"), + issue.id = c("", "", ""), + event.name = c("add_link", "add_link", "add_link"), + author.name = c("Thomas", "Karl", "Thomas"), + weight = c(1, 1, 1), + type = TYPE.EDGES.INTRA, + relation = "issue" + ) + + ## configurations + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + net.conf$update.values(updated.values = list(artifact.relation = "issue", artifact.directed = test.directed)) + + ## construct objects + proj.data = ProjectData$new(project.conf = proj.conf) + network.builder = 
NetworkBuilder$new(project.data = proj.data, network.conf = net.conf) + + ## build expected network + network.expected = igraph::graph.data.frame(edges, directed = test.directed, vertices = vertices) + + ## build network + network.built = network.builder$get.artifact.network() + + ## test + expect_true(igraph::identical_graphs(network.built, network.expected)) +}, patrick::cases( + "directed: FALSE" = list(test.directed = FALSE), + "directed: TRUE" = list(test.directed = TRUE) +)) + +patrick::with_parameters_test_that("Network construction of an empty 'comments-only' issue-based artifact-network", { + + ## + ## 'issues.only.comments' (by default), this should not create any edges + ## + + ## configurations + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + net.conf = NetworkConf$new() + net.conf$update.values(updated.values = list(artifact.relation = "issue", artifact.directed = test.directed)) + + ## construct objects + proj.data = ProjectData$new(project.conf = proj.conf) + network.builder = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf) + + ## build network + network.built = network.builder$get.artifact.network() + + ## 1) vertices + vertices = data.frame(name = c("", + "", + "", + "", + "", + "", + ""), + kind = "Issue", + type = TYPE.ARTIFACT) + ## 2) edges + edges = data.frame( + from = character(), to = character(), date = get.date.from.string(character(0)), artifact.type = character(), + issue.id = character(), event.name = character(), weight = numeric(), type = character(), + relation = character() + ) + + ## build expected network + network.expected = igraph::graph.data.frame(edges, directed = test.directed, vertices = vertices) + + ## test + compare.networks(network.built, network.expected) +}, patrick::cases( + "directed: FALSE" = list(test.directed = FALSE), + "directed: TRUE" = list(test.directed = TRUE) +)) diff --git a/tests/test-networks-author.R b/tests/test-networks-author.R index 
613c1862..f2c250a9 100644 --- a/tests/test-networks-author.R +++ b/tests/test-networks-author.R @@ -21,6 +21,7 @@ ## Copyright 2018 by Jakob Kronawitter ## Copyright 2018-2019 by Anselm Fehnker ## Copyright 2021 by Johannes Hostert +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. @@ -488,75 +489,93 @@ test_that("Network construction of the undirected author-issue network with all ## edge attributes edges = data.frame(from = c(rep("Thomas", 5), rep("Thomas", 4), rep("Olaf", 3), # - rep("Olaf", 4), # - rep("Karl", 6), rep("Karl", 5), rep("Olaf", 3), # + rep("Olaf", 4), rep("Thomas", 3), rep("Thomas", 3), # + rep("Olaf", 7), rep("Thomas", 7), rep("Thomas", 4), # rep("Olaf", 3), # - rep("udo", 4), rep("udo", 7), rep("udo", 3), rep("Thomas", 7), rep("Thomas", 3), rep("Björn", 6), # - rep("Thomas", 9), rep("Thomas", 6), rep("Björn", 11), # - rep("Björn", 6) # + rep("Thomas", 4), rep("Karl", 3), rep("Björn", 7), rep("Olaf", 3), rep("Thomas", 3), rep("Thomas", 7), + rep("Thomas", 3), rep("Björn", 6), rep("Olaf", 2), rep("Olaf", 6), # + rep("Thomas", 10), rep("Thomas", 7), rep("Olaf", 11), # + rep("Björn", 6), rep("Thomas", 4), rep("Thomas", 4) # ), to = c(rep("Olaf", 5), rep("Björn", 4), rep("Björn", 3), # - rep("Björn", 4), # - rep("Olaf", 6), rep("Thomas", 5), rep("Thomas", 3), # + rep("Björn", 4), rep("Björn", 3), rep("Olaf", 3), # + rep("Karl", 7), rep("Karl", 7), rep("Olaf", 4), # rep("Björn", 3), # - rep("Thomas", 4), rep("Björn", 7), rep("Olaf", 3), rep("Björn", 7), rep("Olaf", 3), rep("Olaf", 6), # - rep("Björn", 9), rep("Olaf", 6), rep("Olaf", 11), # - rep("Max", 6) # + rep("udo", 4), rep("udo", 3), rep("udo", 7), rep("udo", 3), rep("Karl", 3), rep("Björn", 7), + rep("Olaf", 3), rep("Karl", 6), rep("Karl", 2), rep("Björn", 6), # + rep("Björn", 10), rep("Olaf", 7), rep("Björn", 11), # + rep("Max", 6), rep("Björn", 4), rep("Max", 4) # ), date = get.date.from.string(c( "2016-07-12 15:59:25", "2016-07-12 15:59:25", "2016-07-12 15:59:59", # "2016-07-12 
16:01:01", "2016-07-14 13:37:00", "2016-07-12 15:59:25", "2016-07-12 15:59:25", "2016-07-12 15:59:59", "2016-07-12 16:06:01", "2016-07-12 16:01:01", "2016-07-14 13:37:00", "2016-07-12 16:06:01", "2016-07-12 14:59:25", "2016-07-12 14:59:25", "2016-07-12 16:04:59", # - "2016-07-12 16:04:59", + "2016-07-12 16:04:59", "2016-07-12 14:59:25", "2016-07-12 14:59:25", + "2016-08-07 15:30:00", "2016-07-12 16:04:59", "2016-07-12 16:04:59", + "2016-08-07 15:30:00", "2016-07-12 15:59:25", "2016-07-12 15:59:59", "2016-08-07 15:37:02", # - "2016-08-31 16:45:09", "2016-07-12 15:59:25", "2016-07-12 16:06:30", - "2016-07-12 15:59:25", "2016-07-12 15:59:59", "2016-08-07 15:37:02", - "2016-08-31 16:45:09", "2016-10-05 16:45:09", "2016-07-12 15:59:25", - "2016-07-12 16:06:30", "2016-10-05 16:45:09", + "2016-08-07 15:37:02", "2016-08-31 16:45:09", "2016-07-12 15:59:25", + "2016-07-12 16:06:30", "2016-07-12 15:59:25", "2016-07-12 15:59:59", + "2016-08-07 15:37:02", "2016-08-07 15:37:02", "2016-08-31 16:45:09", + "2016-08-07 15:30:00", "2016-10-05 16:45:09", "2016-07-12 15:59:25", + "2016-07-12 16:06:30", "2016-08-07 15:30:00", "2016-10-05 16:45:09", "2016-07-12 16:02:02", "2016-07-12 16:02:02", "2016-07-12 16:02:02", # "2016-07-12 15:30:02", "2016-07-12 15:30:02", "2016-07-12 16:03:59", # "2016-10-13 15:30:02", "2016-07-12 15:30:02", "2016-07-12 15:30:02", + "2016-08-07 15:37:02", "2016-07-12 15:30:02", "2016-07-12 15:30:02", "2016-08-31 15:30:02", "2016-10-05 15:30:02", "2016-12-07 15:30:02", "2016-12-07 15:30:02", "2017-05-23 12:32:39", "2016-07-12 15:30:02", "2016-07-12 15:30:02", "2017-05-23 12:31:34", "2016-07-12 16:03:59", + "2016-10-13 15:30:02", "2016-08-07 15:37:02", "2016-07-12 16:03:59", "2016-10-13 15:30:02", "2016-08-31 15:30:02", "2016-10-05 15:30:02", "2016-12-07 15:30:02", "2016-12-07 15:30:02", "2017-05-23 12:32:39", "2016-07-12 16:03:59", "2016-10-13 15:30:02", "2017-05-23 12:31:34", - "2016-08-31 15:30:02", "2016-10-05 15:30:02", "2016-12-07 15:30:02", - "2016-12-07 
15:30:02", "2017-05-23 12:32:39", "2017-05-23 12:31:34", - "2013-04-21 23:52:09", "2013-04-21 23:52:09", "2013-05-05 21:46:30", # - "2013-05-05 21:49:21", "2013-05-05 21:49:34", "2013-05-06 01:04:34", - "2013-05-25 03:48:41", "2013-05-25 04:08:07", "2013-06-01 06:53:06", - "2013-04-21 23:52:09", "2013-04-21 23:52:09", "2013-05-25 03:25:06", - "2013-05-25 06:06:53", "2013-05-25 06:22:23", "2013-06-01 06:50:26", + "2016-08-07 15:37:02", "2016-08-31 15:30:02", "2016-10-05 15:30:02", + "2016-12-07 15:30:02", "2016-12-07 15:30:02", "2017-05-23 12:32:39", + "2016-08-07 15:37:02", "2017-05-23 12:31:34", "2016-08-31 15:30:02", + "2016-10-05 15:30:02", "2016-12-07 15:30:02", "2016-12-07 15:30:02", + "2017-05-23 12:32:39", "2017-05-23 12:31:34", + "2013-04-21 23:52:09", "2013-04-21 23:52:09", "2017-05-21 12:00:00", # "2013-05-05 21:46:30", "2013-05-05 21:49:21", "2013-05-05 21:49:34", "2013-05-06 01:04:34", "2013-05-25 03:48:41", "2013-05-25 04:08:07", - "2013-06-01 06:53:06", "2013-05-25 03:25:06", "2013-05-25 06:06:53", - "2013-05-25 06:22:23", "2013-06-01 06:50:26", + "2013-06-01 06:53:06", "2013-04-21 23:52:09", "2013-04-21 23:52:09", + "2017-05-21 12:00:00", "2013-05-25 03:25:06", "2013-05-25 06:06:53", + "2013-05-25 06:22:23", "2013-06-01 06:50:26", "2013-05-05 21:46:30", + "2013-05-05 21:49:21", "2013-05-05 21:49:34", "2013-05-06 01:04:34", + "2013-05-25 03:48:41", "2013-05-25 04:08:07", "2013-06-01 06:53:06", + "2013-05-25 03:25:06", "2013-05-25 06:06:53", "2013-05-25 06:22:23", + "2013-06-01 06:50:26", "2016-07-12 16:01:30", "2016-07-12 16:02:30", "2016-07-15 19:55:39", # - "2016-07-15 20:07:47", "2016-07-27 20:12:08", "2016-07-28 06:27:52" + "2016-07-15 20:07:47", "2016-07-27 20:12:08", "2016-07-28 06:27:52", + "2016-07-12 16:01:30", "2016-07-12 16:02:30", "2016-07-15 19:55:39", + "2017-05-21 12:00:00", "2016-07-15 20:07:47", "2016-07-27 20:12:08", + "2016-07-28 06:27:52", "2017-05-21 12:00:00" )), artifact.type = "IssueEvent", - issue.id = c( rep("", 12), rep("", 
4), rep("", 14), - rep("", 3), rep("", 30), rep("", 26), - rep("", 6)), + issue.id = c(rep("", 12), rep("", 10), rep("", 18), + rep("", 3), rep("", 44), rep("", 28), + rep("", 14)), event.name = c("created", "commented", "state_updated", "commented", "state_updated", "created", # "commented", "state_updated", "commented", "commented", "state_updated", "commented", - "created", "commented", "merged", "state_updated", # - "created", "commented", "add_link", "referenced", "assigned", "state_updated", "created", # - "commented", "add_link", "referenced", "referenced", "assigned", "state_updated", "referenced", + "created", "commented", "merged", "state_updated", "created", "commented", "referenced_by", # + "merged", "state_updated", "referenced_by", + "created", "commented", "add_link", "add_link", "referenced", "assigned", "state_updated", "created", # + "commented", "add_link", "add_link", "referenced", "add_link", "referenced", "assigned", "state_updated", "add_link", + "referenced", "commit_added", "created", "commented", # - "mentioned", "subscribed", "commented", "add_link", "mentioned", "subscribed", "mentioned", # - "subscribed", "mentioned", "subscribed", "commented", "mentioned", "subscribed", "labeled", - "commented", "add_link", "mentioned", "subscribed", "mentioned", "subscribed", "commented", - "commented", "add_link", "labeled", "mentioned", "subscribed", "mentioned", "subscribed", - "commented", "labeled", - "created", "commented", "commented", "commented", "commented", "commented", "commented", # - "commented", "resolution_updated", "created", "commented", "commented", "commented", + "mentioned", "subscribed", "commented", "add_link", "mentioned", "subscribed", "referenced_by", # + "mentioned", "subscribed", "mentioned", "subscribed", "mentioned", "subscribed", "commented", + "mentioned", "subscribed", "labeled", "commented", "add_link", "referenced_by", "commented", "add_link", "mentioned", + "subscribed", "mentioned", "subscribed", "commented", 
"commented", "add_link", "labeled", "referenced_by", + "mentioned", "subscribed", "mentioned", "subscribed", "commented", "referenced_by", "labeled", + "mentioned", "subscribed", "mentioned", "subscribed", "commented", "labeled", + "created", "commented", "referenced_by", "commented", "commented", "commented", "commented", # + "commented", "commented", "resolution_updated", "created", "commented", "referenced_by", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "resolution_updated", "commented", "commented", "commented", "commented", - "created", "commented", "commented", "commented", "commented", "commented" # + "created", "commented", "commented", "commented", "commented", "commented", "created", # + "commented", "commented", "add_link", "commented", "commented", "commented", "add_link" ), weight = 1, type = TYPE.EDGES.INTRA, diff --git a/tests/test-networks-covariates.R b/tests/test-networks-covariates.R index 5ebaf02e..d3c06863 100644 --- a/tests/test-networks-covariates.R +++ b/tests/test-networks-covariates.R @@ -21,6 +21,7 @@ ## Copyright 2021 by Johannes Hostert ## Copyright 2021-2022 by Niklas Schneider ## Copyright 2022 by Jonathan Baumann +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. 
@@ -597,21 +598,21 @@ test_that("Test add.vertex.attribute.author.issue.count", { networks.and.data = get.network.covariates.test.networks(issues=TRUE, author.relation = "issue") expected.attributes.issues.only = list( - range = network.covariates.test.build.expected(c(0L, 1L, 1L, 1L), c(0L, 1L, 1L), c(2L, 1L, 1L, 1L)), - cumulative = network.covariates.test.build.expected(c(0L, 1L, 1L, 1L), c(1L, 1L, 1L), c(2L, 1L, 1L, 1L)), - all.ranges = network.covariates.test.build.expected(c(1L, 1L, 1L, 1L), c(1L, 2L, 1L), c(2L, 1L, 1L, 1L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 2L, 1L), c(2L, 2L, 2L), c(3L, 2L, 1L, 1L)), - project.all.ranges = network.covariates.test.build.expected(c(2L, 1L, 2L, 1L), c(2L, 3L, 2L), c(3L, 2L, 1L, 1L)), - complete = network.covariates.test.build.expected(c(3L, 1L, 3L, 1L), c(3L, 3L, 3L), c(3L, 3L, 1L, 1L)) + range = network.covariates.test.build.expected(c(0L, 1L, 1L, 1L), c(0L, 1L, 1L), c(2L, 1L, 1L, 2L, 1L)), + cumulative = network.covariates.test.build.expected(c(0L, 1L, 1L, 1L), c(1L, 1L, 1L), c(2L, 1L, 2L, 2L, 1L)), + all.ranges = network.covariates.test.build.expected(c(2L, 2L, 1L, 1L), c(1L, 2L, 2L), c(2L, 1L, 2L, 2L, 1L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 2L, 1L), c(2L, 2L, 2L), c(3L, 2L, 3L, 2L, 1L)), + project.all.ranges = network.covariates.test.build.expected(c(3L, 2L, 2L, 1L), c(2L, 3L, 3L), c(3L, 2L, 3L, 2L, 1L)), + complete = network.covariates.test.build.expected(c(4L, 2L, 3L, 1L), c(3L, 3L, 4L), c(3L, 3L, 4L, 2L, 1L)) ) expected.attributes.prs.only = list( - range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(3L, 1L, 0L), c(1L, 1L, 0L, 0L)), - cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(3L, 1L, 1L), c(2L, 3L, 0L, 0L)), - all.ranges = network.covariates.test.build.expected(c(1L, 0L, 3L, 0L), c(3L, 2L, 1L), c(2L, 3L, 0L, 0L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(3L, 
2L, 1L), c(3L, 3L, 0L, 0L)), - project.all.ranges = network.covariates.test.build.expected(c(1L, 0L, 3L, 0L), c(3L, 3L, 1L), c(3L, 3L, 0L, 0L)), - complete = network.covariates.test.build.expected(c(1L, 0L, 3L, 0L), c(3L, 3L, 1L), c(3L, 3L, 0L, 0L)) + range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(3L, 1L, 0L), c(1L, 1L, 1L, 0L, 0L)), + cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(3L, 1L, 1L), c(2L, 3L, 2L, 0L, 0L)), + all.ranges = network.covariates.test.build.expected(c(2L, 0L, 3L, 0L), c(3L, 2L, 2L), c(2L, 3L, 2L, 0L, 0L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(3L, 2L, 1L), c(3L, 3L, 2L, 0L, 0L)), + project.all.ranges = network.covariates.test.build.expected(c(2L, 0L, 3L, 0L), c(3L, 3L, 2L), c(3L, 3L, 2L, 0L, 0L)), + complete = network.covariates.test.build.expected(c(2L, 0L, 3L, 0L), c(3L, 3L, 2L), c(3L, 3L, 2L, 0L, 0L)) ) expected.attributes.both = sum.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) @@ -661,21 +662,21 @@ test_that("Test add.vertex.attribute.author.issues.commented.count", { networks.and.data = get.network.covariates.test.networks(issues = TRUE, author.relation = "issue") expected.attributes.issues.only = list( - range = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 0L, 1L)), - cumulative = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 1L)), - all.ranges = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 1L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 1L, 0L), c(1L, 2L, 2L), c(2L, 1L, 1L, 1L)), - project.all.ranges = network.covariates.test.build.expected(c(2L, 1L, 1L, 0L), c(1L, 2L, 2L), c(2L, 1L, 1L, 1L)), - complete = network.covariates.test.build.expected(c(2L, 1L, 1L, 0L), c(1L, 3L, 2L), c(3L, 1L, 1L, 1L)) + range = network.covariates.test.build.expected(c(0L, 1L, 0L, 
0L), c(0L, 1L, 1L), c(1L, 0L, 0L, 0L, 1L)), + cumulative = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 1L, 1L)), + all.ranges = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 1L, 1L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 1L, 0L), c(1L, 2L, 2L), c(2L, 1L, 2L, 1L, 1L)), + project.all.ranges = network.covariates.test.build.expected(c(2L, 1L, 1L, 0L), c(1L, 2L, 2L), c(2L, 1L, 2L, 1L, 1L)), + complete = network.covariates.test.build.expected(c(2L, 1L, 1L, 0L), c(1L, 3L, 2L), c(3L, 1L, 2L, 1L, 1L)) ) expected.attributes.prs.only = list( - range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 0L), c(1L, 0L, 0L, 0L)), - cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 1L), c(1L, 2L, 0L, 0L)), - all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 1L, 1L), c(1L, 2L, 0L, 0L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 1L, 1L), c(2L, 2L, 0L, 0L)), - project.all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 2L, 1L), c(2L, 2L, 0L, 0L)), - complete = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 2L, 1L), c(2L, 2L, 0L, 0L)) + range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 0L), c(1L, 0L, 0L, 0L, 0L)), + cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 1L), c(1L, 2L, 1L, 0L, 0L)), + all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 1L, 1L), c(1L, 2L, 1L, 0L, 0L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 1L, 1L), c(2L, 2L, 1L, 0L, 0L)), + project.all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 2L, 1L), c(2L, 2L, 1L, 0L, 0L)), + complete = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 2L, 1L), c(2L, 2L, 1L, 0L, 0L)) ) 
expected.attributes.both = sum.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) @@ -724,21 +725,21 @@ test_that("Test add.vertex.attribute.author.issue.creation.count", { networks.and.data = get.network.covariates.test.networks(issues = TRUE, author.relation = "issue") expected.attributes.issues.only = list( - range = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 0L), c(0L, 0L, 0L, 0L)), - cumulative = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 0L), c(1L, 0L, 1L, 0L)), - all.ranges = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 0L), c(1L, 0L, 1L, 0L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 0L)), - project.all.ranges = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 0L)), - complete = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 0L)) + range = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 0L), c(0L, 0L, 0L, 0L, 0L)), + cumulative = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 0L), c(1L, 0L, 0L, 1L, 0L)), + all.ranges = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 0L), c(1L, 0L, 0L, 1L, 0L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 1L, 0L)), + project.all.ranges = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 1L, 0L)), + complete = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 1L, 0L)) ) expected.attributes.prs.only = list( - range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(1L, 0L, 0L), c(0L, 0L, 0L, 0L)), - cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(1L, 0L, 1L), c(0L, 1L, 0L, 0L)), - all.ranges = network.covariates.test.build.expected(c(1L, 0L, 1L, 
0L), c(1L, 0L, 1L), c(0L, 1L, 0L, 0L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(1L, 1L, 1L), c(1L, 1L, 0L, 0L)), - project.all.ranges = network.covariates.test.build.expected(c(1L, 0L, 1L, 0L), c(1L, 1L, 1L), c(1L, 1L, 0L, 0L)), - complete = network.covariates.test.build.expected(c(1L, 0L, 1L, 0L), c(1L, 1L, 1L), c(1L, 1L, 0L, 0L)) + range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(1L, 0L, 0L), c(0L, 0L, 0L, 0L, 0L)), + cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(1L, 0L, 1L), c(0L, 1L, 1L, 0L, 0L)), + all.ranges = network.covariates.test.build.expected(c(1L, 0L, 1L, 0L), c(1L, 0L, 1L), c(0L, 1L, 1L, 0L, 0L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(1L, 1L, 1L), c(1L, 1L, 1L, 0L, 0L)), + project.all.ranges = network.covariates.test.build.expected(c(1L, 0L, 1L, 0L), c(1L, 1L, 1L), c(1L, 1L, 1L, 0L, 0L)), + complete = network.covariates.test.build.expected(c(1L, 0L, 1L, 0L), c(1L, 1L, 1L), c(1L, 1L, 1L, 0L, 0L)) ) expected.attributes.both = sum.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) @@ -787,21 +788,21 @@ test_that("Test add.vertex.attribute.author.issue.comment.count", { networks.and.data = get.network.covariates.test.networks(issues = TRUE, author.relation = "issue") expected.attributes.issues.only = list( - range = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 0L, 3L)), - cumulative = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 1L), c(2L, 0L, 1L, 3L)), - all.ranges = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 2L, 1L), c(2L, 0L, 1L, 3L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 4L, 0L), c(4L, 7L, 2L), c(8L, 4L, 1L, 3L)), - project.all.ranges = network.covariates.test.build.expected(c(2L, 1L, 4L, 0L), c(4L, 8L, 2L), c(8L, 4L, 1L, 3L)), - complete = 
network.covariates.test.build.expected(c(2L, 1L, 4L, 0L), c(4L, 9L, 2L), c(9L, 4L, 1L, 3L)) + range = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 0L, 0L, 3L)), + cumulative = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 1L), c(2L, 0L, 1L, 1L, 3L)), + all.ranges = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 2L, 1L), c(2L, 0L, 1L, 1L, 3L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 4L, 0L), c(4L, 7L, 2L), c(8L, 4L, 2L, 1L, 3L)), + project.all.ranges = network.covariates.test.build.expected(c(2L, 1L, 4L, 0L), c(4L, 8L, 2L), c(8L, 4L, 2L, 1L, 3L)), + complete = network.covariates.test.build.expected(c(2L, 1L, 4L, 0L), c(4L, 9L, 2L), c(9L, 4L, 2L, 1L, 3L)) ) expected.attributes.prs.only = list( - range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 0L), c(1L, 0L, 0L, 0L)), - cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 1L), c(1L, 2L, 0L, 0L)), - all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 1L, 1L), c(1L, 2L, 0L, 0L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 1L, 1L), c(2L, 2L, 0L, 0L)), - project.all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 2L, 1L), c(2L, 2L, 0L, 0L)), - complete = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 2L, 1L), c(2L, 2L, 0L, 0L)) + range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 0L), c(1L, 0L, 0L, 0L, 0L)), + cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 1L), c(1L, 2L, 1L, 0L, 0L)), + all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 1L, 1L), c(1L, 2L, 1L, 0L, 0L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 1L, 1L), c(2L, 2L, 1L, 0L, 0L)), + project.all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), 
c(2L, 2L, 1L), c(2L, 2L, 1L, 0L, 0L)), + complete = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 2L, 1L), c(2L, 2L, 1L, 0L, 0L)) ) expected.attributes.both = sum.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) @@ -1668,21 +1669,21 @@ test_that("Test add.vertex.attribute.issue.contributor.count", { networks.and.data = get.network.covariates.test.networks("artifact", issues = TRUE, artifact.relation = "issue") expected.attributes.issues.only = list( - range = network.covariates.test.build.expected(c(1L, 2L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 2L, 2L, 1L)), - cumulative = network.covariates.test.build.expected(c(1L, 2L, NA), c(NA, 1L, NA, 2L, NA), c(NA, 2L, 2L, 3L)), - all.ranges = network.covariates.test.build.expected(c(3L, 2L, NA), c(NA, 2L, NA, 3L, NA), c(NA, 2L, 2L, 3L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 2L, NA), c(NA, 1L, NA, 2L, NA), c(NA, 2L, 2L, 3L)), - project.all.ranges = network.covariates.test.build.expected(c(3L, 2L, NA), c(NA, 2L, NA, 3L, NA), c(NA, 2L, 2L, 3L)), - complete = network.covariates.test.build.expected(c(4L, 3L, NA), c(NA, 2L, NA, 4L, NA), c(NA, 3L, 2L, 4L)) + range = network.covariates.test.build.expected(c(1L, 2L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 3L, 2L, NA, 2L)), + cumulative = network.covariates.test.build.expected(c(1L, 2L, NA), c(NA, 1L, NA, 2L, NA), c(NA, 3L, 2L, NA, 4L)), + all.ranges = network.covariates.test.build.expected(c(4L, 3L, NA), c(NA, 2L, NA, 4L, NA), c(NA, 3L, 2L, NA, 4L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 2L, NA), c(NA, 1L, NA, 2L, NA), c(NA, 3L, 2L, NA, 4L)), + project.all.ranges = network.covariates.test.build.expected(c(4L, 3L, NA), c(NA, 2L, NA, 4L, NA), c(NA, 3L, 2L, NA, 4L)), + complete = network.covariates.test.build.expected(c(5L, 3L, NA), c(NA, 3L, NA, 5L, NA), c(NA, 3L, 3L, NA, 5L)) ) expected.attributes.prs.only = list( - range = network.covariates.test.build.expected(c(NA, NA, 1L), c(1L, 
NA, 2L, NA, 1L), c(2L, NA, NA, NA)), - cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 2L, NA, 1L), c(3L, NA, NA, NA)), - all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 2L, NA, 1L), c(3L, NA, NA, NA)), - project.cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 2L, NA, 2L), c(3L, NA, NA, NA)), - project.all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 2L, NA, 2L), c(3L, NA, NA, NA)), - complete = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 2L, NA, 2L), c(3L, NA, NA, NA)) + range = network.covariates.test.build.expected(c(NA, NA, 1L), c(1L, NA, 2L, NA, 1L), c(2L, NA, NA, 1L, NA)), + cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 2L, NA, 1L), c(3L, NA, NA, 2L, NA)), + all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 2L, NA, 2L), c(3L, NA, NA, 2L, NA)), + project.cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 2L, NA, 2L), c(3L, NA, NA, 3L, NA)), + project.all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 2L, NA, 3L), c(3L, NA, NA, 3L, NA)), + complete = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 2L, NA, 3L), c(3L, NA, NA, 3L, NA)) ) expected.attributes.both = merge.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) @@ -1794,10 +1795,10 @@ test_that("Test add.vertex.attribute.issue.contributor.count with issues.only.co expected.attributes.issues.only = list( range = network.covariates.test.build.expected(c(NA, 2L), c(NA, NA, 1L, 1L), c(NA, 2L)), cumulative = network.covariates.test.build.expected(c(NA, 2L), c(NA, NA, 1L, 2L), c(NA, 2L)), - all.ranges = network.covariates.test.build.expected(c(NA, 2L), c(NA, NA, 2L, 3L), c(NA, 2L)), + all.ranges = network.covariates.test.build.expected(c(NA, 3L), c(NA, NA, 2L, 4L), c(NA, 2L)), project.cumulative = 
network.covariates.test.build.expected(c(NA, 2L), c(NA, NA, 1L, 2L), c(NA, 2L)), - project.all.ranges = network.covariates.test.build.expected(c(NA, 2L), c(NA, NA, 2L, 3L), c(NA, 2L)), - complete = network.covariates.test.build.expected(c(NA, 3L), c(NA, NA, 2L, 4L), c(NA, 2L)) + project.all.ranges = network.covariates.test.build.expected(c(NA, 3L), c(NA, NA, 2L, 4L), c(NA, 2L)), + complete = network.covariates.test.build.expected(c(NA, 3L), c(NA, NA, 3L, 5L), c(NA, 3L)) ) expected.attributes.prs.only = list( @@ -1856,21 +1857,21 @@ test_that("Test add.vertex.attribute.issue.event.count", { networks.and.data = get.network.covariates.test.networks("artifact", issues = TRUE, artifact.relation = "issue") expected.attributes.issues.only = list( - range = network.covariates.test.build.expected(c(1L, 3L, NA), c(NA, 2L, NA, 1L, NA), c(NA, 3L, 4L, 1L)), - cumulative = network.covariates.test.build.expected(c(1L, 3L, NA), c(NA, 2L, NA, 2L, NA), c(NA, 6L, 6L, 3L)), - all.ranges = network.covariates.test.build.expected(c(3L, 6L, NA), c(NA, 6L, NA, 3L, NA), c(NA, 6L, 6L, 3L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 3L, NA), c(NA, 2L, NA, 2L, NA), c(NA, 6L, 6L, 3L)), - project.all.ranges = network.covariates.test.build.expected(c(3L, 6L, NA), c(NA, 6L, NA, 3L, NA), c(NA, 6L, 6L, 3L)), - complete = network.covariates.test.build.expected(c(8L, 7L, NA), c(NA, 6L, NA, 8L, NA), c(NA, 7L, 6L, 8L)) + range = network.covariates.test.build.expected(c(1L, 3L, NA), c(NA, 2L, NA, 1L, NA), c(NA, 4L, 4L, NA, 2L)), + cumulative = network.covariates.test.build.expected(c(1L, 3L, NA), c(NA, 2L, NA, 2L, NA), c(NA, 7L, 6L, NA, 4L)), + all.ranges = network.covariates.test.build.expected(c(4L, 7L, NA), c(NA, 6L, NA, 4L, NA), c(NA, 7L, 6L, NA, 4L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 3L, NA), c(NA, 2L, NA, 2L, NA), c(NA, 7L, 6L, NA, 4L)), + project.all.ranges = network.covariates.test.build.expected(c(4L, 7L, NA), c(NA, 6L, NA, 4L, NA), 
c(NA, 7L, 6L, NA, 4L)), + complete = network.covariates.test.build.expected(c(9L, 8L, NA), c(NA, 7L, NA, 9L, NA), c(NA, 8L, 7L, NA, 9L)) ) expected.attributes.prs.only = list( - range = network.covariates.test.build.expected(c(NA, NA, 2L), c(1L, NA, 2L, NA, 1L), c(2L, NA, NA, NA)), - cumulative = network.covariates.test.build.expected(c(NA, NA, 2L), c(3L, NA, 2L, NA, 1L), c(5L, NA, NA, NA)), - all.ranges = network.covariates.test.build.expected(c(NA, NA, 5L), c(5L, NA, 2L, NA, 1L), c(5L, NA, NA, NA)), - project.cumulative = network.covariates.test.build.expected(c(NA, NA, 2L), c(3L, NA, 2L, NA, 2L), c(5L, NA, NA, NA)), - project.all.ranges = network.covariates.test.build.expected(c(NA, NA, 5L), c(5L, NA, 2L, NA, 2L), c(5L, NA, NA, NA)), - complete = network.covariates.test.build.expected(c(NA, NA, 5L), c(5L, NA, 2L, NA, 2L), c(5L, NA, NA, NA)) + range = network.covariates.test.build.expected(c(NA, NA, 2L), c(1L, NA, 2L, NA, 1L), c(2L, NA, NA, 1L, NA)), + cumulative = network.covariates.test.build.expected(c(NA, NA, 2L), c(3L, NA, 2L, NA, 1L), c(5L, NA, NA, 2L, NA)), + all.ranges = network.covariates.test.build.expected(c(NA, NA, 5L), c(5L, NA, 2L, NA, 2L), c(5L, NA, NA, 2L, NA)), + project.cumulative = network.covariates.test.build.expected(c(NA, NA, 2L), c(3L, NA, 2L, NA, 2L), c(5L, NA, NA, 3L, NA)), + project.all.ranges = network.covariates.test.build.expected(c(NA, NA, 5L), c(5L, NA, 2L, NA, 3L), c(5L, NA, NA, 3L, NA)), + complete = network.covariates.test.build.expected(c(NA, NA, 5L), c(5L, NA, 2L, NA, 3L), c(5L, NA, NA, 3L, NA)) ) expected.attributes.both = merge.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) @@ -1918,21 +1919,21 @@ test_that("Test add.vertex.attribute.issue.comment.count", { networks.and.data = get.network.covariates.test.networks("artifact", issues = TRUE, artifact.relation = "issue") expected.attributes.issues.only = list( - range = network.covariates.test.build.expected(c(0L, 1L, NA), c(NA, 1L, NA, 1L, 
NA), c(NA, 0L, 4L, 0L)), - cumulative = network.covariates.test.build.expected(c(0L, 1L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 1L, 5L, 1L)), - all.ranges = network.covariates.test.build.expected(c(1L, 1L, NA), c(NA, 5L, NA, 1L, NA), c(NA, 1L, 5L, 1L)), - project.cumulative = network.covariates.test.build.expected(c(0L, 1L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 1L, 5L, 1L)), - project.all.ranges = network.covariates.test.build.expected(c(1L, 1L, NA), c(NA, 5L, NA, 1L, NA), c(NA, 1L, 5L, 1L)), - complete = network.covariates.test.build.expected(c(2L, 1L, NA), c(NA, 5L, NA, 2L, NA), c(NA, 1L, 5L, 2L)) + range = network.covariates.test.build.expected(c(0L, 1L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 0L, 4L, NA, 0L)), + cumulative = network.covariates.test.build.expected(c(0L, 1L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 1L, 5L, NA, 1L)), + all.ranges = network.covariates.test.build.expected(c(1L, 1L, NA), c(NA, 5L, NA, 1L, NA), c(NA, 1L, 5L, NA, 1L)), + project.cumulative = network.covariates.test.build.expected(c(0L, 1L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 1L, 5L, NA, 1L)), + project.all.ranges = network.covariates.test.build.expected(c(1L, 1L, NA), c(NA, 5L, NA, 1L, NA), c(NA, 1L, 5L, NA, 1L)), + complete = network.covariates.test.build.expected(c(2L, 1L, NA), c(NA, 5L, NA, 2L, NA), c(NA, 1L, 5L, NA, 2L)) ) expected.attributes.prs.only = list( - range = network.covariates.test.build.expected(c(NA, NA, 1L), c(1L, NA, 1L, NA, 0L), c(1L, NA, NA, NA)), - cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 1L, NA, 0L), c(3L, NA, NA, NA)), - all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 1L, NA, 0L), c(3L, NA, NA, NA)), - project.cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 1L, NA, 1L), c(3L, NA, NA, NA)), - project.all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 1L, NA, 1L), c(3L, NA, NA, NA)), - complete = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 1L, NA, 
1L), c(3L, NA, NA, NA)) + range = network.covariates.test.build.expected(c(NA, NA, 1L), c(1L, NA, 1L, NA, 0L), c(1L, NA, NA, 0L, NA)), + cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 1L, NA, 0L), c(3L, NA, NA, 0L, NA)), + all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 1L, NA, 0L), c(3L, NA, NA, 0L, NA)), + project.cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 1L, NA, 1L), c(3L, NA, NA, 1L, NA)), + project.all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 1L, NA, 1L), c(3L, NA, NA, 1L, NA)), + complete = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 1L, NA, 1L), c(3L, NA, NA, 1L, NA)) ) expected.attributes.both = merge.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) @@ -1991,6 +1992,7 @@ test_that("Test add.vertex.attribute.issue.opened.date", { c(NA, "2016-07-12 15:59:25", "2016-07-12 16:01:30", + NA, "2016-07-12 14:30:13")) expected.attributes.prs.only = network.covariates.test.build.expected( @@ -2005,6 +2007,7 @@ test_that("Test add.vertex.attribute.issue.opened.date", { c("2016-07-14 13:37:00", NA, NA, + "2016-07-12 14:59:25", NA)) expected.attributes.both = network.covariates.test.build.expected( @@ -2019,6 +2022,7 @@ test_that("Test add.vertex.attribute.issue.opened.date", { c("2016-07-14 13:37:00", "2016-07-12 15:59:25", "2016-07-12 16:01:30", + "2016-07-12 14:59:25", "2016-07-12 14:30:13")) ## convert date strings to POSIXct @@ -2088,6 +2092,7 @@ test_that("Test add.vertex.attribute.issue.closed.date", { c(NA, "2016-07-12 16:06:30", NA, + NA, NA)) expected.attributes.prs.only = network.covariates.test.build.expected( @@ -2102,6 +2107,7 @@ test_that("Test add.vertex.attribute.issue.closed.date", { c(NA, NA, NA, + "2016-07-12 16:04:59", NA)) expected.attributes.both = network.covariates.test.build.expected( @@ -2116,6 +2122,7 @@ test_that("Test add.vertex.attribute.issue.closed.date", { 
c(NA, "2016-07-12 16:06:30", NA, + "2016-07-12 16:04:59", NA)) ## convert date strings to POSIXct @@ -2177,79 +2184,79 @@ test_that("Test add.vertex.attribute.issue.last.activity.date", { range = network.covariates.test.build.expected( c("2016-07-12 15:30:02", "2016-07-12 15:59:59", NA), c(NA , "2016-07-12 16:02:30", NA , "2016-07-12 16:03:59", NA), - c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02")), cumulative = network.covariates.test.build.expected( c("2016-07-12 15:30:02", "2016-07-12 15:59:59", NA), c(NA , "2016-07-12 16:02:30", NA , "2016-07-12 16:03:59", NA), - c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02")), all.ranges = network.covariates.test.build.expected( c("2016-08-31 15:30:02", "2016-08-31 16:45:09", NA), c(NA , "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02", NA), - c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02")), project.cumulative = network.covariates.test.build.expected( c("2016-07-12 15:30:02", "2016-07-12 15:59:59", NA), c(NA , "2016-07-12 16:02:30", NA , "2016-07-12 16:03:59", NA), - c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02")), project.all.ranges = network.covariates.test.build.expected( c("2016-08-31 15:30:02", "2016-08-31 16:45:09", NA), c(NA , "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02", NA), - c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02")), complete = network.covariates.test.build.expected( c("2017-05-23 12:32:39", "2016-10-05 16:45:09", NA), - c(NA , "2016-07-28 
06:27:52", NA , "2017-05-23 12:32:39", NA), - c(NA , "2016-10-05 16:45:09", "2016-07-28 06:27:52", "2017-05-23 12:32:39"))) + c(NA , "2017-05-21 12:00:00", NA , "2017-05-23 12:32:39", NA), + c(NA , "2016-10-05 16:45:09", "2017-05-21 12:00:00", NA , "2017-05-23 12:32:39"))) expected.attributes.prs.only = list( range = network.covariates.test.build.expected( c(NA , NA , "2016-07-12 15:59:59"), c("2016-07-12 16:01:01", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", NA , NA , NA)), + c("2016-07-14 13:37:00", NA , NA , "2016-08-07 15:30:00", NA)), cumulative = network.covariates.test.build.expected( c(NA , NA , "2016-07-12 15:59:59"), c("2016-07-12 16:01:01", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", NA , NA , NA)), + c("2016-07-14 13:37:00", NA , NA , "2016-08-07 15:30:00", NA)), all.ranges = network.covariates.test.build.expected( c(NA , NA , "2016-07-14 13:37:00"), - c("2016-07-14 13:37:00", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", NA , NA , NA)), + c("2016-07-14 13:37:00", NA , "2016-07-12 16:02:02", NA , "2016-08-07 15:30:00"), + c("2016-07-14 13:37:00", NA , NA , "2016-08-07 15:30:00", NA)), project.cumulative = network.covariates.test.build.expected( c(NA , NA , "2016-07-12 15:59:59"), c("2016-07-12 16:01:01", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", NA , NA , NA)), + c("2016-07-14 13:37:00", NA , NA , "2016-08-07 15:30:00", NA)), project.all.ranges = network.covariates.test.build.expected( c(NA , NA , "2016-07-14 13:37:00"), - c("2016-07-14 13:37:00", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", NA , NA , NA)), + c("2016-07-14 13:37:00", NA , "2016-07-12 16:02:02", NA , "2016-08-07 15:30:00"), + c("2016-07-14 13:37:00", NA , NA , "2016-08-07 15:30:00", NA)), complete = network.covariates.test.build.expected( c(NA , NA , "2016-07-14 13:37:00"), - c("2016-07-14 
13:37:00", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", NA , NA , NA))) + c("2016-07-14 13:37:00", NA , "2016-07-12 16:02:02", NA , "2016-08-07 15:30:00"), + c("2016-07-14 13:37:00", NA , NA , "2016-08-07 15:30:00", NA))) expected.attributes.both = list( range = network.covariates.test.build.expected( c("2016-07-12 15:30:02", "2016-07-12 15:59:59", "2016-07-12 15:59:59"), c("2016-07-12 16:01:01", "2016-07-12 16:02:30", "2016-07-12 16:02:02", "2016-07-12 16:03:59", "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-07 15:30:00", "2016-08-31 15:30:02")), cumulative = network.covariates.test.build.expected( c("2016-07-12 15:30:02", "2016-07-12 15:59:59", "2016-07-12 15:59:59"), c("2016-07-12 16:01:01", "2016-07-12 16:02:30", "2016-07-12 16:02:02", "2016-07-12 16:03:59", "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-07 15:30:00", "2016-08-31 15:30:02")), all.ranges = network.covariates.test.build.expected( c("2016-08-31 15:30:02", "2016-08-31 16:45:09", "2016-07-14 13:37:00"), - c("2016-07-14 13:37:00", "2016-07-28 06:27:52", "2016-07-12 16:02:02", "2016-08-31 15:30:02", "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c("2016-07-14 13:37:00", "2016-07-28 06:27:52", "2016-07-12 16:02:02", "2016-08-31 15:30:02", "2016-08-07 15:30:00"), + c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-07 15:30:00", "2016-08-31 15:30:02")), project.cumulative = network.covariates.test.build.expected( c("2016-07-12 15:30:02", "2016-07-12 15:59:59", "2016-07-12 15:59:59"), c("2016-07-12 16:01:01", "2016-07-12 16:02:30", "2016-07-12 
16:02:02", "2016-07-12 16:03:59", "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-07 15:30:00", "2016-08-31 15:30:02")), project.all.ranges = network.covariates.test.build.expected( c("2016-08-31 15:30:02", "2016-08-31 16:45:09", "2016-07-14 13:37:00"), - c("2016-07-14 13:37:00", "2016-07-28 06:27:52", "2016-07-12 16:02:02", "2016-08-31 15:30:02", "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c("2016-07-14 13:37:00", "2016-07-28 06:27:52", "2016-07-12 16:02:02", "2016-08-31 15:30:02", "2016-08-07 15:30:00"), + c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-07 15:30:00", "2016-08-31 15:30:02")), complete = network.covariates.test.build.expected( c("2017-05-23 12:32:39", "2016-10-05 16:45:09", "2016-07-14 13:37:00"), - c("2016-07-14 13:37:00", "2016-07-28 06:27:52", "2016-07-12 16:02:02", "2017-05-23 12:32:39", "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", "2016-10-05 16:45:09", "2016-07-28 06:27:52", "2017-05-23 12:32:39"))) + c("2016-07-14 13:37:00", "2017-05-21 12:00:00", "2016-07-12 16:02:02", "2017-05-23 12:32:39", "2016-08-07 15:30:00"), + c("2016-07-14 13:37:00", "2016-10-05 16:45:09", "2017-05-21 12:00:00", "2016-08-07 15:30:00", "2017-05-23 12:32:39"))) ## convert date strings to POSIXct expected.attributes.issues.only = lapply(expected.attributes.issues.only, function(times) { @@ -2328,6 +2335,7 @@ test_that("Test add.vertex.attribute.issue.title", { c(NA, "Error in construct.networks.from.list for openssl function networks", "[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table", + NA, "Distinguish directedness of networks and edge-construction algorithm")) expected.attributes.prs.only = network.covariates.test.build.expected( @@ -2342,6 +2350,7 @@ test_that("Test 
add.vertex.attribute.issue.title", { c("Example pull request 1", NA, NA, + "Example pull request 2", NA)) expected.attributes.both = network.covariates.test.build.expected( @@ -2356,6 +2365,7 @@ test_that("Test add.vertex.attribute.issue.title", { c("Example pull request 1", "Error in construct.networks.from.list for openssl function networks", "[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table", + "Example pull request 2", "Distinguish directedness of networks and edge-construction algorithm")) ## Test issues only @@ -2403,7 +2413,7 @@ test_that("Test add.vertex.attribute.pr.open.merged.or.closed", { networks.and.data = get.network.covariates.test.networks("artifact", artifact.relation = "issue") expected.attributes = network.covariates.test.build.expected( - c(NA, NA, "open"), c("open", NA, "open", NA, "merged"), c("open", NA, NA, NA) + c(NA, NA, "open"), c("open", NA, "open", NA, "merged"), c("open", NA, NA, "merged", NA) ) ## Test @@ -2424,7 +2434,7 @@ test_that("Test add.vertex.attribute.issue.is.pull.request", { networks.and.data = get.network.covariates.test.networks("artifact", artifact.relation = "issue") expected.attributes = network.covariates.test.build.expected( - c(FALSE, FALSE, TRUE), c(TRUE, FALSE, TRUE, FALSE, TRUE), c(TRUE, FALSE, FALSE, FALSE) + c(FALSE, FALSE, TRUE), c(TRUE, FALSE, TRUE, FALSE, TRUE), c(TRUE, FALSE, FALSE, TRUE, FALSE) ) ## Test diff --git a/tests/test-networks-multi-relation.R b/tests/test-networks-multi-relation.R index c724d155..7cd6ae75 100644 --- a/tests/test-networks-multi-relation.R +++ b/tests/test-networks-multi-relation.R @@ -224,8 +224,8 @@ test_that("Construction of the multi network for the feature artifact with autho ## 1) construct expected vertices vertices = data.frame( name = c("Björn", "Olaf", "Karl", "Thomas", "udo", "Fritz fritz@example.org", "georg", "Hans", - "Base_Feature", "foo", "A", "", "", "", "", - "", "", ""), + "Base_Feature", "foo", "A", "", "", "", + "", "", "", ""), kind = 
c(rep(TYPE.AUTHOR, 8), rep("Feature", 3), rep("Issue", 7)), type = c(rep(TYPE.AUTHOR, 8), rep(TYPE.ARTIFACT, 10)) ) diff --git a/tests/test-read.R b/tests/test-read.R index 9a597f23..ec8f95bc 100644 --- a/tests/test-read.R +++ b/tests/test-read.R @@ -351,107 +351,110 @@ test_that("Read and parse the issue data.", { issue.data.read.github = read.issues(proj.conf$get.value("datapath.issues"), proj.conf$get.value("issues.from.source")) ## build the expected data.frame - issue.data.expected = data.frame(issue.id = c(rep("", 13), rep("", 6), - rep("", 7), rep("", 10), - rep("", 6), rep("", 4), rep("", 3)), - issue.title = c(rep("[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name", 13), - rep("[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table", 6), - rep("Error in construct.networks.from.list for openssl function networks", 7), - rep("Distinguish directedness of networks and edge-construction algorithm", 10), + issue.data.expected = data.frame(issue.id = c(rep("", 14), rep("", 7), + rep("", 9), rep("", 11), + rep("", 6), rep("", 5), rep("", 3)), + issue.title = c(rep("[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name", 14), + rep("[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table", 7), + rep("Error in construct.networks.from.list for openssl function networks", 9), + rep("Distinguish directedness of networks and edge-construction algorithm", 11), rep("Example pull request 1", 6), - rep("Example pull request 2", 4), + rep("Example pull request 2", 5), rep("Example pull request 4", 3)), - issue.type = I(c(rep(list(list("issue" , "bug")), 13), rep(list(list("issue" , "bug")), 6), - rep(list(list("issue" , "bug")), 7), rep(list(list("issue", "bug", "enhancement")), 10), - rep(list(list("pull request")), 6), rep(list(list("pull request")), 4), rep(list(list("pull request", "enhancement")), 3))), - issue.state = c(rep("closed", 13), rep("open", 6), 
rep("closed", 7), rep("open", 10), - rep("reopened", 6), rep("closed", 4), rep("open", 3)), - issue.resolution = I(c(rep(list(list("fixed")), 13), rep(list(list("unresolved")), 6), - rep(list(list()), 7), rep(list(list()), 10), - rep(list(list()), 6), rep(list(list()), 4), rep(list(list()), 3))), - creation.date = get.date.from.string(c(rep("2013-04-21 23:52:09", 13), - rep("2016-07-12 16:01:30", 6), - rep("2016-07-12 15:59:25", 7), - rep("2016-07-12 14:30:13", 10), + issue.type = I(c(rep(list(list("issue" , "bug")), 14), rep(list(list("issue" , "bug")), 7), + rep(list(list("issue" , "bug")), 9), rep(list(list("issue", "bug", "enhancement")), 11), + rep(list(list("pull request")), 6), rep(list(list("pull request")), 5), rep(list(list("pull request", "enhancement")), 3))), + issue.state = c(rep("closed", 14), rep("open", 7), rep("closed", 9), rep("open", 11), + rep("reopened", 6), rep("closed", 5), rep("open", 3)), + issue.resolution = I(c(rep(list(list("fixed")), 14), rep(list(list("unresolved")), 7), + rep(list(list()), 9), rep(list(list()), 11), + rep(list(list()), 6), rep(list(list()), 5), rep(list(list()), 3))), + creation.date = get.date.from.string(c(rep("2013-04-21 23:52:09", 14), + rep("2016-07-12 16:01:30", 7), + rep("2016-07-12 15:59:25", 9), + rep("2016-07-12 14:30:13", 11), rep("2016-07-14 13:37:00", 6), - rep("2016-07-12 14:59:25", 4), + rep("2016-07-12 14:59:25", 5), rep("2016-07-12 16:02:02", 3))), - closing.date = get.date.from.string(c(rep("2013-05-25 20:02:08", 13), rep(NA, 6), - rep("2016-07-12 16:06:30", 7), rep(NA, 10), + closing.date = get.date.from.string(c(rep("2013-05-25 20:02:08", 14), rep(NA, 7), + rep("2016-07-12 16:06:30", 9), rep(NA, 11), rep(NA, 6), - rep("2016-07-12 16:04:59", 4), + rep("2016-07-12 16:04:59", 5), rep(NA, 3))), - issue.components = I(c(rep(list(list("GUI" , "Interpreters")), 13), rep(list(list("Interpreters")), 6), - rep(list(list()), 7), rep(list(list()), 10), - rep(list(list()), 6), rep(list(list()), 4), 
rep(list(list()), 3))), + issue.components = I(c(rep(list(list("GUI" , "Interpreters")), 14), rep(list(list("Interpreters")), 7), + rep(list(list()), 9), rep(list(list()), 11), + rep(list(list()), 6), rep(list(list()), 5), rep(list(list()), 3))), event.name = c("created", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", - "resolution_updated", "created", "commented", "commented", "commented", "commented", - "commented", "created", "assigned", "commented", "state_updated", "add_link", - "referenced", "referenced", "mentioned", "subscribed", "commented", "mentioned", - "subscribed", "add_link", "mentioned", "subscribed", "labeled", "commented", + "resolution_updated", "referenced_by", "add_link", "created", "commented", "commented", "commented", "commented", + "commented", "created", "assigned", "commented", "state_updated", "add_link", "referenced", + "referenced", "add_link", "add_link", "mentioned", "subscribed", "commented", "mentioned", + "subscribed", "add_link", "mentioned", "subscribed", "labeled", "commented", "referenced_by", "created", "commented", "state_updated", "commented", "commented", "state_updated", "created", "commented", "merged", "state_updated", - "commit_added", "created", "commented"), + "referenced_by", "commit_added", "created", "commented"), author.name = c("Thomas", "Thomas", "Björn", "Björn", "Björn", "Björn", "Olaf", "Björn", - "Björn", "Olaf", "Olaf", "Olaf", "Björn", "Björn", "Björn", "Björn", "Max", - "Max", "Max", "Karl", "Olaf", "Karl", "Olaf", "Karl", "Karl", "Thomas", "udo", + "Björn", "Olaf", "Olaf", "Olaf", "Björn", "Thomas", "Thomas", "Björn", "Björn", "Björn", "Max", + "Max", "Max", "Karl", "Olaf", "Karl", "Olaf", "Karl", "Karl", "Thomas", "Karl", "Thomas", "udo", "udo", "Thomas", "Björn", "Björn", "Thomas", "Björn", "Björn", "Olaf", "Björn", - "Thomas", "Thomas", "Thomas", "Olaf", "Björn", "Olaf", - "Björn", "Björn", "Olaf", "Olaf", 
"Björn", "Olaf", "Olaf"), + "Karl", "Thomas", "Thomas", "Thomas", "Olaf", "Björn", "Olaf", + "Björn", "Björn", "Olaf", "Olaf", "Thomas", "Björn", "Olaf", "Olaf"), author.email = c("thomas@example.org", "thomas@example.org", "bjoern@example.org", "bjoern@example.org", "bjoern@example.org", "bjoern@example.org", "olaf@example.org", "bjoern@example.org", "bjoern@example.org", "olaf@example.org", "olaf@example.org", "olaf@example.org", - "bjoern@example.org", "bjoern@example.org", "bjoern@example.org", - "bjoern@example.org", "max@example.org", "max@example.org", + "bjoern@example.org", "thomas@example.org", "thomas@example.org", "bjoern@example.org", + "bjoern@example.org", "bjoern@example.org", "max@example.org", "max@example.org", "max@example.org", "karl@example.org", "olaf@example.org", - "karl@example.org", "olaf@example.org", "karl@example.org", - "karl@example.org", "thomas@example.org", "udo@example.org", + "karl@example.org", "olaf@example.org", "karl@example.org", "karl@example.org", + "thomas@example.org", "karl@example.org", "thomas@example.org", "udo@example.org", "udo@example.org", "thomas@example.org", "bjoern@example.org", - "bjoern@example.org", "thomas@example.org", "bjoern@example.org", - "bjoern@example.org", "olaf@example.org", "bjoern@example.org", + "bjoern@example.org", "thomas@example.org", "bjoern@example.org", "bjoern@example.org", + "olaf@example.org", "bjoern@example.org", "karl@example.org", "thomas@example.org", "thomas@example.org", "thomas@example.org", "olaf@example.org", "bjoern@example.org", "olaf@example.org", "bjoern@example.org", "bjoern@example.org", "olaf@example.org", - "olaf@example.org", "bjoern@example.org", "olaf@example.org", - "olaf@example.org"), + "olaf@example.org", "thomas@example.org", "bjoern@example.org", + "olaf@example.org", "olaf@example.org"), date = get.date.from.string(c("2013-04-21 23:52:09", "2013-04-21 23:52:09", "2013-05-05 21:46:30", "2013-05-05 21:49:21", "2013-05-05 21:49:34", "2013-05-06 01:04:34", 
"2013-05-25 03:25:06", "2013-05-25 03:48:41", "2013-05-25 04:08:07", "2013-05-25 06:06:53", "2013-05-25 06:22:23", "2013-06-01 06:50:26", - "2013-06-01 06:53:06", "2016-07-12 16:01:30", + "2013-06-01 06:53:06", "2017-05-21 12:00:00", + "2017-05-21 12:00:00", "2016-07-12 16:01:30", "2016-07-12 16:02:30", "2016-07-15 19:55:39", "2016-07-15 20:07:47", "2016-07-27 20:12:08", "2016-07-28 06:27:52", "2016-07-12 15:59:25", "2016-07-12 15:59:25", "2016-07-12 15:59:59", "2016-07-12 16:06:30", "2016-08-07 15:37:02", "2016-08-31 16:45:09", "2016-10-05 16:45:09", + "2016-08-07 15:37:02", "2016-08-07 15:30:00", "2016-07-12 15:30:02", "2016-07-12 15:30:02", "2016-07-12 16:03:59", "2016-08-31 15:30:02", "2016-10-05 15:30:02", "2016-10-13 15:30:02", "2016-12-07 15:30:02", "2016-12-07 15:30:02", "2017-05-23 12:31:34", "2017-05-23 12:32:39", - "2016-07-12 15:59:25", "2016-07-12 15:59:25", - "2016-07-12 15:59:59", "2016-07-12 16:01:01", - "2016-07-12 16:06:01", "2016-07-14 13:37:00", - "2016-07-12 14:59:25", "2016-07-12 14:59:25", - "2016-07-12 16:04:59", "2016-07-12 16:04:59", + "2016-08-07 15:37:02", "2016-07-12 15:59:25", + "2016-07-12 15:59:25", "2016-07-12 15:59:59", + "2016-07-12 16:01:01", "2016-07-12 16:06:01", + "2016-07-14 13:37:00", "2016-07-12 14:59:25", + "2016-07-12 14:59:25", "2016-07-12 16:04:59", + "2016-07-12 16:04:59", "2016-08-07 15:30:00", "2016-07-12 16:02:02", "2016-07-12 16:02:02", "2016-07-12 16:02:02")), event.info.1 = c("open", "open", "open", "open", "open", "open", "open", "open", "open", - "open", "open", "open", "fixed", "open", "open", "open", "open", "open", - "open", "open", "", "open", "closed", "930af63a030fb92e48eddff01f53284c3eeba80e", - "", "", "Thomas", "Thomas", "open", "Thomas", "Thomas", "fb52357f05958007b867da06f4077abdc04fa0d8", - "udo", "udo", "decided", "open", - "open", "open", "closed", "closed", "closed", "open", - "open", "open", "", "closed", - "72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "open", "open"), + "open", "open", "open", 
"fixed", "", + "", "open", "open", "open", "open", "open", "open", "open", + "", "open", "closed", "930af63a030fb92e48eddff01f53284c3eeba80e", "", "", "", + "", "Thomas", "Thomas", "open", "Thomas", "Thomas", "fb52357f05958007b867da06f4077abdc04fa0d8", + "udo", "udo", "decided", "open", "", "open", "open", "closed", "closed", "closed", "open", + "open", "open", "", "closed", "", + "72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "open", "open"), event.info.2 = NA, # is assigned later event.id = NA, # is assigned later - issue.source = c(rep("jira", 19), rep("github", 17), rep("github", 13)), + issue.source = c(rep("jira", 21), rep("github", 20), rep("github", 14)), artifact.type = "IssueEvent" ) @@ -459,13 +462,12 @@ test_that("Read and parse the issue data.", { list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), - "unresolved", list("unresolved"), list("unresolved"), list("unresolved"), + "unresolved", "issue", "issue", list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list(), "", list(), "open", - "commit", "", "", "thomas@example.org", "thomas@example.org", list(), - "thomas@example.org", "thomas@example.org", "commit", "udo@example.org", - "udo@example.org", "", list(), - list(), list(), "open", list(), list(), "closed", - list(), list(), "", "open", + "commit", "", "", "issue", "issue", "thomas@example.org", "thomas@example.org", list(), + "thomas@example.org", "thomas@example.org", "commit", "udo@example.org", "udo@example.org", + "", list(), "issue", list(), list(), "open", list(), list(), "closed", + list(), list(), "", "open", "issue", "2016-07-12 15:58:59", list(), list() )) diff --git a/tests/test-split-data-activity-based.R b/tests/test-split-data-activity-based.R index c654446f..c4879aa4 100644 --- 
a/tests/test-split-data-activity-based.R +++ b/tests/test-split-data-activity-based.R @@ -19,6 +19,7 @@ ## Copyright 2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2022 by Jonathan Baumann +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. context("Splitting functionality, activity-based splitting of data.") @@ -113,8 +114,8 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:41, 45:49), ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:46, 50:51, 53:55), ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 25, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] ), mails = list( @@ -192,7 +193,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(14:15, 20:23, 29, 37:41, 45:49), ] + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(16:17, 22:25, 33, 42:46, 50:51, 53:55), ] ), mails = list( "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$mails[15:16, ] # when pasta is not configured: rownames(data$mails) %in% 16:17 @@ -306,8 +307,8 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$issues[0, ], "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$issues[0, ], "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$issues[0, ], - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% 
c(1:13, 27:28, 43:44), ], - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:40, 45:49), ], + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:13, 31:32, 48:49), ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:45, 50:51, 53:55), ], "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$issues[0, ] ), mails = list( @@ -395,7 +396,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commit.messages ), issues = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29, 37:40, 43:45, 46:49), ] + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:13, 16:17, 22:24, 31:33, 42:45, 48:51, 53:55), ] ), mails = list( "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$mails @@ -458,8 +459,9 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2013-04-21 23:52:09-2013-05-25 06:22:23", "2013-05-25 06:22:23-2016-07-12 15:59:59", "2016-07-12 15:59:59-2016-07-12 16:06:30", - "2016-07-12 16:06:30-2016-10-05 15:30:02", - "2016-10-05 15:30:02-2017-05-23 12:32:40" + "2016-07-12 16:06:30-2016-08-07 15:37:02", + "2016-08-07 15:37:02-2017-05-23 12:31:34", + "2017-05-23 12:31:34-2017-05-23 12:32:40" ) lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") @@ -476,7 +478,8 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.basis = "issues", split.sliding.window = FALSE, split.revisions = c("2013-04-21 23:52:09", "2013-05-25 06:22:23", "2016-07-12 15:59:59", - "2016-07-12 16:06:30", "2016-10-05 15:30:02", "2017-05-23 12:32:40"), + "2016-07-12 16:06:30", "2016-08-07 15:37:02", "2017-05-23 12:31:34", + "2017-05-23 12:32:40"), 
split.revision.dates = NULL ) lapply(results, function(res) { @@ -491,44 +494,50 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commits[0, ], "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$commits[1, ], "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$commits[2:5, ], - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$commits[6:8, ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commits[0, ] + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$commits[6:8, ], + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$commits[0, ], + "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$commits[0, ] ), commit.messages = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commit.messages, "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$commit.messages, "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$commit.messages, - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$commit.messages, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commit.messages + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$commit.messages, + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$commit.messages, + "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$commit.messages ), issues = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$issues[rownames(data$issues) %in% 1:10, ], - "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$issues[rownames(data$issues) %in% c(11:13, 20:21, 27:28, 43:44, 37:38), ], - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$issues[rownames(data$issues) %in% c(14:15, 22, 29, 39:41, 45:49), ], - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$issues[rownames(data$issues) %in% c(16:19, 23:25, 30, 42), ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(26, 31:36), ] + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$issues[rownames(data$issues) %in% c(11:13, 22:24, 31:32, 42:43, 48:49), ], + "2016-07-12 15:59:59-2016-07-12 16:06:30" = 
data$issues[rownames(data$issues) %in% c(16:17, 33, 44:46, 50:51, 53:55), ], + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$issues[rownames(data$issues) %in% c(18:21, 25:26, 29:30, 47, 52), ], + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$issues[rownames(data$issues) %in% c(14:15, 27:28, 34:38, 41), ], + "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(39:40), ] ), mails = list( ## comments indicate row names when pasta is not configured "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$mails[0, ], - "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$mails[13:14, ], # rownames(data$mails) %in% 14:15 - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$mails[0, ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$mails[0, ] + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$mails[13:14, ], # rownames(data$mails) %in% 13:14 + "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$mails[15:16, ], # rownames(data$mails) %in% 15:16 + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$mails[0, ], + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$mails[0, ], + "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$mails[0, ] ), pasta = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$pasta, "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$pasta, "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$pasta, - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$pasta, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$pasta + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$pasta, + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$pasta, + "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$pasta ), synchronicity = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$synchronicity, "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$synchronicity, "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$synchronicity, - "2016-07-12 16:06:30-2016-10-05 15:30:02" 
= data$synchronicity, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$synchronicity + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$synchronicity, + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$synchronicity, + "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( @@ -569,7 +578,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity ## test that the config contains the correct splitting information expected.config = list( split.type = "activity-based", - split.length = 59, + split.length = 65, split.basis = "issues", split.sliding.window = FALSE, split.revisions = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), @@ -696,10 +705,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:41, 45:49), ], - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 29, 40:41, 45:49), ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:46, 50:51, 53:55), ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(16:17, 33, 45:46, 50:51, 53:55), ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 25, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 25, ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -761,7 +770,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.type = "activity-based", split.length = 18, split.basis = "commits", - 
split.sliding.window = TRUE, + split.sliding.window = FALSE, # The sliding-window approach does not apply if we only have one range or less split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:33"), split.revision.dates = NULL ) @@ -780,7 +789,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(14:15, 20:23, 29, 37:41, 45:49), ] + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(16:17, 22:25, 33, 42:46, 50:51, 53:55), ] ), mails = list( "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$mails[15:16, ] # when pasta is not configured: rownames(data$mails) %in% 16:17 @@ -847,7 +856,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10", "2016-07-12 16:00:45-2016-07-12 16:06:20", "2016-07-12 16:06:10-2016-07-12 16:06:32", - "2016-07-12 16:06:20-2016-07-12 16:06:33", + "2016-07-12 16:06:20-2016-07-12 16:06:32", "2016-07-12 16:06:32-2016-07-12 16:06:33" ) lapply(results, function(res) { @@ -866,7 +875,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.basis = "commits", split.sliding.window = TRUE, split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:06:10", - "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:33", + "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:32", "2016-07-12 16:06:33"), split.revision.dates = NULL ) @@ -882,21 +891,21 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:3, ], "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commits[2:4, ], "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = 
data$commits[5:8, ], + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$commits[5:8, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commits[7:9, ] ), commit.messages = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commit.messages, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commit.messages, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commit.messages, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages, + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$commit.messages, "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:41, 45:49), ], - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 29, 40:41, 45:49), ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ], + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:46, 50:51, 53:55), ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(16:17, 33, 45:46, 50:51, 53:55), ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 25, ], + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 25, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] ), mails = list( @@ -904,21 +913,21 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ], + "2016-07-12 16:06:20-2016-07-12 16:06:32" = 
data$mails[0, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$mails[0, ] ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$pasta, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta, + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$pasta, "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$pasta ), synchronicity = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity, + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$synchronicity, "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$synchronicity ) ) @@ -1041,10 +1050,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$issues[0, ], "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$issues[0, ], "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$issues[0, ], - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:13, 27:28, 43:44), ], - "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$issues[rownames(data$issues) %in% c(1:13, 27:28, 43:44), ], - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:40, 45:49), ], - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:40, 45:49), ] + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:13, 31:32, 48:49), ], + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$issues[rownames(data$issues) %in% c(1:13, 31:32, 48:49), ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:45, 50:51, 53:55), ], + "2016-07-12 15:58:50-2016-07-12 
16:05:38" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:45, 50:51, 53:55), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -1124,7 +1133,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.type = "activity-based", split.length = 26, split.basis = "mails", - split.sliding.window = TRUE, + split.sliding.window = FALSE, # The sliding-window approach does not apply if we only have one range or less split.revisions = c("2004-10-09 18:38:13", "2016-07-12 16:05:38"), split.revision.dates = NULL ) @@ -1143,7 +1152,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commit.messages ), issues = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29, 37:40, 43:49), ] + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:13, 16:17, 22:24, 31:33, 42:45, 48:51, 53:55), ] ), mails = list( "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$mails @@ -1209,9 +1218,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:30:02-2016-07-12 16:02:02", "2016-07-12 15:59:59-2016-07-12 16:06:30", "2016-07-12 16:02:02-2016-07-27 20:12:08", - "2016-07-12 16:06:30-2016-10-05 15:30:02", - "2016-07-27 20:12:08-2017-05-23 12:31:34", - "2016-10-05 15:30:02-2017-05-23 12:32:40" + "2016-07-12 16:06:30-2016-08-07 15:37:02", + "2016-07-27 20:12:08-2016-10-05 16:45:09", + "2016-08-07 15:37:02-2017-05-23 12:31:34", + "2016-10-05 16:45:09-2017-05-23 12:32:40" ) lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") @@ -1229,8 +1239,8 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity split.sliding.window = TRUE, split.revisions = c("2013-04-21 23:52:09", "2013-05-06 01:04:34", "2013-05-25 06:22:23", 
"2016-07-12 15:30:02", "2016-07-12 15:59:59", "2016-07-12 16:02:02", - "2016-07-12 16:06:30", "2016-07-27 20:12:08", "2016-10-05 15:30:02", - "2017-05-23 12:31:34", "2017-05-23 12:32:40"), + "2016-07-12 16:06:30", "2016-07-27 20:12:08", "2016-08-07 15:37:02", + "2016-10-05 16:45:09", "2017-05-23 12:31:34", "2017-05-23 12:32:40"), split.revision.dates = NULL ) lapply(results, function(res) { @@ -1248,9 +1258,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$commits[1:2, ], "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$commits[2:5, ], "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$commits[3:8, ], - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$commits[6:8, ], - "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$commits[0, ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commits[0, ] + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$commits[6:8, ], + "2016-07-27 20:12:08-2016-10-05 16:45:09" = data$commits[0, ], + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$commits[0, ], + "2016-10-05 16:45:09-2017-05-23 12:32:40" = data$commits[0, ] ), commit.messages = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commit.messages, @@ -1259,32 +1270,35 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$commit.messages, "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$commit.messages, "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$commit.messages, - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$commit.messages, - "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$commit.messages, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commit.messages + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$commit.messages, + "2016-07-27 20:12:08-2016-10-05 16:45:09" = data$commit.messages, + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$commit.messages, + "2016-10-05 16:45:09-2017-05-23 12:32:40" = 
data$commit.messages ), issues = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$issues[rownames(data$issues) %in% 1:10, ], - "2013-05-06 01:04:34-2016-07-12 15:30:02" = data$issues[rownames(data$issues) %in% c(6:13, 43:44), ], - "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$issues[rownames(data$issues) %in% c(11:13, 20:21, 27:28, 37:38, 43:44), ], - "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$issues[rownames(data$issues) %in% c(14, 20:22, 27:28, 37:40),], - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$issues[rownames(data$issues) %in% c(14:15, 22, 29, 39:41, 45:49), ], - "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$issues[rownames(data$issues) %in% c(15:17, 23, 29, 41:42, 45:49),], - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$issues[rownames(data$issues) %in% c(16:19, 23:25, 30, 42), ], - "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 30:34), ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(26, 31:36), ] + "2013-05-06 01:04:34-2016-07-12 15:30:02" = data$issues[rownames(data$issues) %in% c(6:13, 48:49), ], + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$issues[rownames(data$issues) %in% c(11:13, 22:24, 31:32, 42:43, 48:49), ], + "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$issues[rownames(data$issues) %in% c(16, 22:24, 31:32, 42:45, 53), ], + "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$issues[rownames(data$issues) %in% c(16:17, 33, 44:46, 50:51, 53:55), ], + "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$issues[rownames(data$issues) %in% c(17:19, 25, 33, 46:47, 50:51, 54:55), ], + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$issues[rownames(data$issues) %in% c(18:21, 25:26, 29:30, 47, 52), ], + "2016-07-27 20:12:08-2016-10-05 16:45:09" = data$issues[rownames(data$issues) %in% c(20:21, 26:27, 29:30, 34:35, 41, 52), ], + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$issues[rownames(data$issues) %in% c(14:15, 27:28, 34:38, 
41), ], + "2016-10-05 16:45:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:15, 28, 36:40), ] ), mails = list( ## comments indicate row names when pasta is not configured "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$mails[0, ], "2013-05-06 01:04:34-2016-07-12 15:30:02" = data$mails[0, ], - "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$mails[13:14, ], # rownames(data$mails) %in% 14:15 - "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$mails[13:14, ], # rownames(data$mails) %in% 14:15 - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 - "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$mails[0, ], - "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$mails[0, ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$mails[0, ] + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$mails[13:14, ], # rownames(data$mails) %in% 13:14 + "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$mails[13:14, ], # rownames(data$mails) %in% 13:14 + "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$mails[15:16, ], # rownames(data$mails) %in% 15:16 + "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$mails[15:16, ], # rownames(data$mails) %in% 15:16 + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$mails[0, ], + "2016-07-27 20:12:08-2016-10-05 16:45:09" = data$mails[0, ], + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$mails[0, ], + "2016-10-05 16:45:09-2017-05-23 12:32:40" = data$mails[0, ] ), pasta = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$pasta, @@ -1293,9 +1307,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$pasta, "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$pasta, "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$pasta, - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$pasta, - "2016-07-27 20:12:08-2017-05-23 
12:31:34" = data$pasta, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$pasta + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$pasta, + "2016-07-27 20:12:08-2016-10-05 16:45:09" = data$pasta, + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$pasta, + "2016-10-05 16:45:09-2017-05-23 12:32:40" = data$pasta ), synchronicity = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$synchronicity, @@ -1304,9 +1319,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$synchronicity, "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$synchronicity, "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$synchronicity, - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$synchronicity, - "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$synchronicity, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$synchronicity + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$synchronicity, + "2016-07-27 20:12:08-2016-10-05 16:45:09" = data$synchronicity, + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$synchronicity, + "2016-10-05 16:45:09-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( @@ -1347,9 +1363,9 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity ## test that the config contains the correct splitting information expected.config = list( split.type = "activity-based", - split.length = 59, + split.length = 65, split.basis = "issues", - split.sliding.window = TRUE, + split.sliding.window = FALSE, # The sliding-window approach does not apply if we only have one range or less split.revisions = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), split.revision.dates = NULL ) @@ -1468,8 +1484,8 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 
29, 37:41, 45:49), ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:46, 50:51, 53:55), ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 25, ] ), mails = list( "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$mails[15:16, ], # when pasta is not configured: rownames(data$mails) %in% 16:17 @@ -1595,7 +1611,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w ), issues = list( "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$issues[0, ], - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29, 37:40, 43:45, 46:49), ] + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:13, 16:17, 22:24, 31:33, 42:45, 48:51, 53:55), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -1683,8 +1699,8 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w ## check time ranges expected = c( - "2013-04-21 23:52:09-2016-07-12 16:02:02", - "2016-07-12 16:02:02-2017-05-23 12:32:40" + "2013-04-21 23:52:09-2016-07-12 16:03:59", + "2016-07-12 16:03:59-2017-05-23 12:32:40" ) lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, @@ -1698,10 +1714,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w ## test that the config contains the correct splitting information expected.config = list( split.type = "activity-based", - split.length = 21, + split.length = 24, split.basis = "issues", split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2016-07-12 16:02:02", "2017-05-23 12:32:40"), + split.revisions = c("2013-04-21 23:52:09", "2016-07-12 16:03:59", "2017-05-23 12:32:40"), split.revision.dates = NULL ) lapply(results, function(res) { @@ 
-1713,29 +1729,29 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w ## check data for all ranges expected.data = list( commits = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$commits[1:2, ], - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$commits[3:8, ] + "2013-04-21 23:52:09-2016-07-12 16:03:59" = data$commits[1:2, ], + "2016-07-12 16:03:59-2017-05-23 12:32:40" = data$commits[3:8, ] ), commit.messages = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$commit.messages, - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$commit.messages + "2013-04-21 23:52:09-2016-07-12 16:03:59" = data$commit.messages, + "2016-07-12 16:03:59-2017-05-23 12:32:40" = data$commit.messages ), issues = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$issues[rownames(data$issues) %in% c(1:14, 20:22, 27:28, 37:40, 43:44), ], - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(15:19, 23:26, 29:36, 41:42, 45:49), ] + "2013-04-21 23:52:09-2016-07-12 16:03:59" = data$issues[rownames(data$issues) %in% c(1:13, 16:17, 22:24, 31:32, 42:45, 48:49, 53:55), ], + "2016-07-12 16:03:59-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:15, 18:21, 26:29, 30, 33:40, 25, 41, 46:47, 50:52), ] ), mails = list( ## comments indicate row names when pasta is not configured - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$mails[13:14, ], # rownames(data$mails) %in% 14:15 - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$mails[15:16, ] # rownames(data$maisl) %in% 16:17 + "2013-04-21 23:52:09-2016-07-12 16:03:59" = data$mails[13:14, ], # rownames(data$mails) %in% 13:14 + "2016-07-12 16:03:59-2017-05-23 12:32:40" = data$mails[15:16, ] # rownames(data$maisl) %in% 15:16 ), pasta = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$pasta, - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$pasta + "2013-04-21 23:52:09-2016-07-12 16:03:59" = data$pasta, + "2016-07-12 16:03:59-2017-05-23 12:32:40" = 
data$pasta ), synchronicity = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$synchronicity, - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$synchronicity + "2013-04-21 23:52:09-2016-07-12 16:03:59" = data$synchronicity, + "2016-07-12 16:03:59-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( diff --git a/tests/test-split-data-time-based.R b/tests/test-split-data-time-based.R index 3f28a790..92853da3 100644 --- a/tests/test-split-data-time-based.R +++ b/tests/test-split-data-time-based.R @@ -20,6 +20,7 @@ ## Copyright 2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2022 by Jonathan Baumann +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. context("Splitting functionality, time-based splitting of data.") @@ -113,9 +114,9 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(14, 20:22, 37:40), ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(15,29, 47:49), ], - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(23,41,45:46), ] + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(16, 22:24, 42:45), ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(17, 33, 53:55), ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(25, 46, 50:51), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -230,14 +231,14 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$issues[0, ], "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$issues[0, ], "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], 
- "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 27:29, 37:40, 43:49), ] + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 31:33, 42:45, 48:51, 53:55), ] ), mails = list( ## comments indicate row names when pasta is not configured "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[1:2, ], # rownames(data$mails) %in% 1:2 "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$mails[3:12, ], # rownames(data$mails) %in% 3:12 "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$mails[0, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[13:16, ] # rownames(data$mails) %in% 13:17 + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[13:16, ] # rownames(data$mails) %in% 13:16 ), pasta = list( "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$pasta, @@ -341,12 +342,12 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis ), issues = list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$issues[rownames(data$issues) %in% 1:13, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ], - "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 35:36, ] + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(16:38, 41:55), ], + "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:15, 39:40), ] ), mails = list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$mails[0, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[13:16, ], # when pasta is not configured: rownames(data$mails) %in% 14:17 + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[13:16, ], # when pasta is not configured: rownames(data$mails) %in% 13:16 "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[0, ] ), pasta = list( @@ -458,19 +459,19 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis 
"2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(14, 20:22, 37:40), ], - "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$issues[rownames(data$issues) %in% c(14:15, 40, 47:49), ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(15, 29, 47:49), ], - "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$issues[rownames(data$issues) %in% c(29,41,45,46), ], - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(23,41,45,46), ] + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(16, 22:24, 42:45), ], + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$issues[rownames(data$issues) %in% c(16:17, 45, 53:55), ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(17, 33, 53:55), ], + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$issues[rownames(data$issues) %in% c(33, 46, 50:51), ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(25, 46, 50:51), ] ), mails = list( ## comments indicate row names when pasta is not configured "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$mails[0, ], "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$mails[0, ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$mails[15, ], # rownames(data$mails) == 16 - "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$mails[15:16, ], # rownames(data$mails) %in% c(16,17) - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$mails[16, ] # rownames(data$mails) == 17 + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$mails[15, ], # rownames(data$mails) == 15 + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$mails[15:16, ], # rownames(data$mails) %in% c(15,16) + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$mails[16, ] # rownames(data$mails) == 16 ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$pasta, 
@@ -597,17 +598,17 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$issues[0, ], "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 27:29, 37:40, 43:49), ] + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 31:33, 42:45, 48:51, 53:55), ] ), mails = list( ## comments indicate row names when pasta is not configured - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[1:2, ], # rownames(data$mails) %in% 1:2 + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[1:2, ], # rownames(data$mails) %in% 1:2 "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$mails[0, ], "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$mails[3:12, ], # rownames(data$mails) %in% 3:12 "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$mails[3:12, ], # rownames(data$mails) %in% 3:12 "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$mails[0, ], "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$mails[0, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[13:16, ] # rownames(data$mails) %in% 13:17 + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[13:16, ] # rownames(data$mails) %in% 13:16 ), pasta = list( "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$pasta, @@ -723,15 +724,15 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis issues = list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$issues[rownames(data$issues) %in% 1:13, ], "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$issues[0, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ], - "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:36, 
37:49), ] + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(16:38, 41:55), ], + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:55), ] ), mails = list( ## comments indicate row names when pasta is not configured "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$mails[0, ], "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$mails[0, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[13:16, ], # rownames(data$mails) %in% 14:17 - "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$mails[13:16, ] # rownames(data$mails) %in% 14:17 + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[13:16, ], # rownames(data$mails) %in% 13:17 + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$mails[13:16, ] # rownames(data$mails) %in% 13:17 ), pasta = list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$pasta, @@ -830,7 +831,7 @@ patrick::with_parameters_test_that("Split a data object time-based (bins = ... ) "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commit.messages ), issues = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ] + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(16:38, 41:55), ] ), mails = list( "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ] @@ -927,8 +928,8 @@ patrick::with_parameters_test_that("Split a data object time-based (bins = ... 
, "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$commit.messages ), issues = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ], - "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$issues[rownames(data$issues) %in% 35:36, ] + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(16:38, 41:55), ], + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$issues[rownames(data$issues) %in% c(14:15, 39:40), ] ), mails = list( "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ], @@ -960,6 +961,44 @@ patrick::with_parameters_test_that("Split a data object time-based (bins = ... , "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) )) +## +## Verify that split.data.time.based does not accept an invalid bins parameter +## + +test_that("Split a data object time-based with invalid bins parameter.", { + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## define invalid bins + invalid.bins.not.a.date = c("These", "bins", "are", "invalid") + invalid.bins.contains.NA = c("2016-01-01 00:00:00", NA, "2016-12-31 23:59:59", "2017-06-03 03:03:03") + invalid.bins.not.a.list = "2016-01-01 00:00:00 2016-12-31 23:59:59" + invalid.bins.format.of.split.by.bins = list(bins = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), vector = replicate(24, 1)) + + invalid.bins = list(invalid.bins.not.a.date, invalid.bins.contains.NA, invalid.bins.not.a.list, + invalid.bins.format.of.split.by.bins) + + ## test that all 
invalid bins produce an error + for (invalid.bin in invalid.bins) { + expect_error(split.data.time.based(project.data, bins = invalid.bin, split.basis = "issues"), + regexp = "Stopped due to incorrect parameter types", + info = "Bins need to be a list of characters representing dates.") + } +}) + + ## * * custom event timestamps ---------------------------------------------------------------- ## @@ -1036,16 +1075,16 @@ patrick::with_parameters_test_that("Split a data object time-based using custom "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$commit.messages ), issues = list( - "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$issues[rownames(data$issues) %in% c(20:22, 27, 28, 37:39), ], - "2016-07-12 16:00:00-2016-07-12 16:05:00" = data$issues[rownames(data$issues) %in% c(14, 15, 29, 40, 45:49), ], - "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$issues[rownames(data$issues) %in% c(16:19, 23:24, 41, 42), ], - "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$issues[rownames(data$issues) %in% c(25, 30), ] + "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$issues[rownames(data$issues) %in% c(22:24, 31:32, 42:44), ], + "2016-07-12 16:00:00-2016-07-12 16:05:00" = data$issues[rownames(data$issues) %in% c(16:17, 33, 45, 50:51, 53:55), ], + "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$issues[rownames(data$issues) %in% c(18:21, 25:26, 29:30, 41, 46:47, 52), ], + "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$issues[rownames(data$issues) %in% c(27, 34), ] ), mails = list( ## comments indicate rownames when pasta is not configured - "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$mails[13:14, ], # rownames(data$mails) %in% 14:15 - "2016-07-12 16:00:00-2016-07-12 16:05:00" = data$mails[15, ], # rownames(data$mails) %in% 16 - "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$mails[16, ], # rownames(data$mails) %in% 17 + "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$mails[13:14, ], # rownames(data$mails) %in% 13:14 + "2016-07-12 16:00:00-2016-07-12 16:05:00" = 
data$mails[15, ], # rownames(data$mails) %in% 15 + "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$mails[16, ], # rownames(data$mails) %in% 16 "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$mails[0, ] ), pasta = list( @@ -1272,14 +1311,14 @@ patrick::with_parameters_test_that("Split a data object time-based with equal-si "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$issues[rownames(data$issues) %in% c(20:22, 37:40), ], - "2016-07-12 16:01:30-2016-07-12 16:04:01" = data$issues[rownames(data$issues) %in% c(14, 15, 29, 47:49), ], - "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(23, 41, 45:46), ] + "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$issues[rownames(data$issues) %in% c(22:24, 42:45), ], + "2016-07-12 16:01:30-2016-07-12 16:04:01" = data$issues[rownames(data$issues) %in% c(16:17, 33, 53:55), ], + "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(25, 46, 50:51), ] ), mails = list( "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$mails[0, ], "2016-07-12 16:01:30-2016-07-12 16:04:01" = data$mails[0, ], - "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$mails[15:16, ] # when pasta is not configured: rownames(data$mails) %in% 16:17 + "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$mails[15:16, ] # when pasta is not configured: rownames(data$mails) %in% 15:16 ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$pasta, @@ -1388,14 +1427,14 @@ patrick::with_parameters_test_that("Split a data object time-based with equal-si "2004-10-09 18:38:13-2007-09-18 06:00:04" = data$issues[0, ], "2007-09-18 06:00:04-2010-08-26 17:21:55" = data$issues[0, ], "2010-08-26 17:21:55-2013-08-04 04:43:46" = data$issues[rownames(data$issues) %in% 1:13, ], - "2013-08-04 04:43:46-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 27:29, 37:40, 43:49), ] + "2013-08-04 04:43:46-2016-07-12 
16:05:38" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 31:33, 42:45, 48:51, 53:55), ] ), mails = list( ## comments indicate row names when pasta is not configured "2004-10-09 18:38:13-2007-09-18 06:00:04" = data$mails[1:2, ], "2007-09-18 06:00:04-2010-08-26 17:21:55" = data$mails[3:12, ], # rownames(data$mails) %in% 3:12 "2010-08-26 17:21:55-2013-08-04 04:43:46" = data$mails[0, ], - "2013-08-04 04:43:46-2016-07-12 16:05:38" = data$mails[13:16, ] # rownames(data$mails) %in% 13:17 + "2013-08-04 04:43:46-2016-07-12 16:05:38" = data$mails[13:16, ] # rownames(data$mails) %in% 13:16 ), pasta = list( "2004-10-09 18:38:13-2007-09-18 06:00:04" = data$pasta, @@ -1500,12 +1539,12 @@ patrick::with_parameters_test_that("Split a data object time-based with equal-si issues = list( "2013-04-21 23:52:09-2014-09-01 12:05:39" = data$issues[rownames(data$issues) %in% 1:13, ], "2014-09-01 12:05:39-2016-01-12 00:19:09" = data$issues[0, ], - "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 14:49, ] + "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 14:55, ] ), mails = list( "2013-04-21 23:52:09-2014-09-01 12:05:39" = data$mails[0, ], "2014-09-01 12:05:39-2016-01-12 00:19:09" = data$mails[0, ], - "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$mails[13:16, ] # when pasta is not configured: rownames(data$mails) %in% 13:17 + "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$mails[13:16, ] # when pasta is not configured: rownames(data$mails) %in% 13:16 ), pasta = list( "2013-04-21 23:52:09-2014-09-01 12:05:39" = data$pasta, diff --git a/tests/test-split-misc.R b/tests/test-split-misc.R index c2a9c723..7a2e42b6 100644 --- a/tests/test-split-misc.R +++ b/tests/test-split-misc.R @@ -14,6 +14,7 @@ ## Copyright 2017-2019 by Claus Hunsen ## Copyright 2018 by Jakob Kronawitter ## Copyright 2022 by Jonathan Baumann +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. 
@@ -42,10 +43,10 @@ if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") ## Split raw data (data and networks by bins) ------------------------------ ## -## Tests for split.data.by.bins and split.network.by.bins +## Tests for split.dataframe.by.bins and split.network.by.bins ## -test_that("Split network and data on low level (split.data.by.bins, split.network.by.bins).", { +test_that("Split network and data on low level (split.dataframe.by.bins, split.network.by.bins).", { length.dates = 15 length.bins = 5 @@ -69,7 +70,7 @@ test_that("Split network and data on low level (split.data.by.bins, split.networ ## sprintf("c(\"%s\")", paste( sample(bins, size = length.dates, replace = TRUE), collapse = "', '") ) ## - ## split.data.by.bins + ## split.dataframe.by.bins ## ## generate data frame with dates and IDs @@ -86,7 +87,7 @@ test_that("Split network and data on low level (split.data.by.bins, split.networ "4" = df[ c(4, 11, 13), ], "5" = df[ c(3, 10, 15), ] ) - results = split.data.by.bins(df, bins.vector) + results = split.dataframe.by.bins(df, bins.vector) ## check result expect_equal(results, expected, info = "Split data by bins.") @@ -124,6 +125,50 @@ test_that("Split network and data on low level (split.data.by.bins, split.networ }) +## +## Verify that split.data.by.bins does not accept an invalid bins parameter +## + +test_that("Split a data object by activity-based bins with invalid bins parameter.", { + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## define invalid bins + invalid.bins.not.a.date = 
list(bins = c("These", "bins", "are", "invalid"), vector = replicate(24, 1)) + invalid.bins.not.a.number = list(bins = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), vector = replicate(24, "NaN")) + invalid.bins.contains.NA = list(bins = c("2013-04-21 23:52:09", "2017-05-23 12:32:40", NA), vector = replicate(24, 1)) + invalid.bins.missing.bins = list(vector = replicate(24, 1)) + invalid.bins.missing.vector = list(bins = c("2013-04-21 23:52:09", "2017-05-23 12:32:40")) + invalid.bins.format.of.split.time.based = list("2013-04-21 23:52:09", "2017-05-23 12:32:40") + + invalid.bins = list(invalid.bins.not.a.date, invalid.bins.contains.NA, invalid.bins.missing.bins, + invalid.bins.missing.vector, invalid.bins.format.of.split.time.based) + + ## test that all invalid bins produce an error + for (invalid.bin in invalid.bins) { + expect_error(split.data.by.bins(project.data, + bins = invalid.bin, + split.basis = "issues", + activity.amount = 3000, + sliding.window = FALSE), + regexp = "Stopped due to incorrect parameter types", + info = "Bins need to be a named list with a 'bins' component including characters representing dates" + + " and a 'vector' including numerics.") + } +}) + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Bin identification ------------------------------------------------------ diff --git a/util-misc.R b/util-misc.R index 152f13ca..88817a61 100644 --- a/util-misc.R +++ b/util-misc.R @@ -998,7 +998,7 @@ get.data.from.range = function(range, data) { ## split data by this bin; this gives a list of three data frames, "0" contains the data before the range, "1" the ## data within the range and "2" the holds the data after the range - split.data = split.data.by.bins(data, df.bins) + split.data = split.dataframe.by.bins(data, df.bins) ## look for the element with name "1", as we are interested in the data within the range ## if there is no data, return an empty data frame corresponding to the data we want to cut diff --git 
a/util-networks.R b/util-networks.R index 9f205bba..7fa76f4f 100644 --- a/util-networks.R +++ b/util-networks.R @@ -475,17 +475,71 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", return(private$artifacts.network.issue) } - ## log warning as we do not have relations among issues right now - logging::logwarn(paste( - "There exist no actual artifact network with the relation 'issue'.", - "Return an edge-less network now." - )) + if (private$proj.data$get.project.conf()$get.entry("issues.only.comments")) { + logging::logwarn(paste( + "Create an edge-less artifact network as 'issues.only.comments' is set.", + "Comments in issues cannot create issue edges." + )) + } - ## construct edgeless network with mandatory edge and vertex attributes - directed = private$network.conf$get.value("artifact.directed") - artifacts = private$proj.data$get.artifacts("issues") # issue IDs - artifacts.net = create.empty.network(directed = directed, add.attributes = TRUE) + - igraph::vertices(artifacts) + ## construct edge list based on issue-artifact data + artifacts.net.data.raw = private$proj.data[[DATASOURCE.TO.ARTIFACT.FUNCTION[["issues"]]]]() + + ## obtain issue-connecting events + add.links = artifacts.net.data.raw[artifacts.net.data.raw$event.name == "add_link" & + artifacts.net.data.raw$event.info.2 == "issue", ] + referenced.bys = artifacts.net.data.raw[artifacts.net.data.raw$event.name == "referenced_by" & + artifacts.net.data.raw$event.info.2 == "issue", ] + + if (nrow(add.links) != nrow(referenced.bys)) { + logging::logwarn("Inconsistent issue data. 
Unequally many 'add_link' and 'referenced_by' issue-events.") + } + + vertices = unique(artifacts.net.data.raw["issue.id"]) + edge.list = data.frame() + + # edges in artifact networks can not have the 'artifact' attribute but should instead have + # the 'author.name' attribute as events caused by authors connect issues + edge.attributes = private$network.conf$get.value("edge.attributes") + artifact.index = match("artifact", edge.attributes, nomatch = NA) + if (!is.na(artifact.index)) { + edge.attributes = edge.attributes[-artifact.index] + edge.attributes = c(edge.attributes, c("author.name")) + } + + ## connect corresponding add_link and referenced_by issue-events + edge.list = plyr::rbind.fill(parallel::mclapply(split(add.links, seq_along(add.links)), function(from) { + ## get edge attributes + cols.which = edge.attributes %in% colnames(from) + edge.attrs = from[, edge.attributes[cols.which], drop = FALSE] + + ## construct edge + to = subset(referenced.bys, + event.info.1 == from[["issue.id"]] & + author.name == from[["author.name"]] & + date == from[["date"]]) + if (!all(is.na(to))) { + combination = list("from" = from[["issue.id"]], "to" = to[["issue.id"]]) + combination = cbind(combination, edge.attrs, row.names = NULL) # add edge attributes + return(combination) # return the combination for this row + } + })) + + artifacts.net.data = list( + vertices = data.frame( + name = vertices + ), + edges = edge.list + ) + + ## construct network from obtained data + artifacts.net = construct.network.from.edge.list( + artifacts.net.data[["vertices"]], + artifacts.net.data[["edges"]], + network.conf = private$network.conf, + directed = private$network.conf$get.value("artifact.directed"), + available.edge.attributes = private$proj.data$get.data.columns.for.data.source("issues") + ) ## store network private$artifacts.network.issue = artifacts.net @@ -899,6 +953,11 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", artifacts.to.add.kind = artifacts.all[ 
artifacts.all[["data.vertices"]] %in% artifacts.to.add, "artifact.type" ] + + ## Adjust vertex attribute to 'Issue' in multi networks + ## to be consistent with bipartite networks + artifacts.to.add.kind[artifacts.to.add.kind == "IssueEvent"] = "Issue" + artifacts.net = artifacts.net + igraph::vertices(artifacts.to.add, type = TYPE.ARTIFACT, kind = artifacts.to.add.kind) diff --git a/util-read.R b/util-read.R index 8f1b4fd9..25c3a87d 100644 --- a/util-read.R +++ b/util-read.R @@ -375,7 +375,7 @@ read.issues = function(data.path, issues.sources = c("jira", "github")) { } ## set pattern for issue ID for better recognition - issue.data[["issue.id"]] = sprintf("", issue.data[["issue.source"]], issue.data[["issue.id"]]) + issue.data[["issue.id"]] = sprintf(ISSUE.ID.FORMAT, issue.data[["issue.source"]], issue.data[["issue.id"]]) ## properly parse and store data in list-type columns issue.data[["issue.type"]] = I(unname(lapply(issue.data[["issue.type"]], jsonlite::fromJSON, simplifyVector = FALSE))) @@ -388,6 +388,13 @@ read.issues = function(data.path, issues.sources = c("jira", "github")) { issue.data[["creation.date"]] = get.date.from.string(issue.data[["creation.date"]]) issue.data[["closing.date"]] = get.date.from.string(issue.data[["closing.date"]]) + ## if other issues are referenced, convert names to ID format + matches = issue.data[issue.data[["event.name"]] %in% c("add_link", "remove_link", "referenced_by") & + issue.data[["event.info.2"]] == "issue", ] + formatted.matches = sprintf(ISSUE.ID.FORMAT, matches[["issue.source"]], matches[["event.info.1"]]) + issue.data[issue.data[["event.name"]] %in% c("add_link", "remove_link", "referenced_by") & + issue.data[["event.info.2"]] == "issue", ][["event.info.1"]] = formatted.matches + if (nrow(issue.data) > 0) { ## fix all dates to be after the creation date ## violations can happen for "commit_added" events if the commit was made before the PR was opened @@ -965,4 +972,7 @@ format.commit.ids = function(commit.ids) { 
return (sprintf(COMMIT.ID.FORMAT, commit.ids)) } +## declare a global format for issue.ids in several data frame columns +ISSUE.ID.FORMAT = "" + diff --git a/util-split.R b/util-split.R index 1c0ea9e9..62db7f8a 100644 --- a/util-split.R +++ b/util-split.R @@ -22,6 +22,7 @@ ## Copyright 2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2022 by Jonathan Baumann +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. @@ -64,183 +65,68 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = number.windows = NULL, split.basis = c("commits", "mails", "issues"), sliding.window = FALSE, project.conf.new = NULL) { - ## get basis for splitting process - split.basis = match.arg(split.basis) - - ## if the data used by the split basis is not present, load it automatically - if (!(split.basis %in% project.data$get.cached.data.sources("only.unfiltered"))) { - function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[split.basis]] - project.data[[function.name]]() + # validate existence and type of the 'bins' parameter + if (!is.null(bins) && !lubridate::is.POSIXct(bins)) { + dates = parallel::mclapply(unlist(bins), get.date.from.string) + if (any(is.na(dates))) { + logging::logerror(paste("The bins parameter, if present, needs to be a vector", + "whose elements represent dates")) + stop("Stopped due to incorrect parameter types") + } } - ## get actual raw data - data.to.split = project.data$get.cached.data.sources("only.unfiltered") - - data = lapply(data.to.split, function(ds) { - ## build the name of the respective getter and call it - function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[ds]] - return(project.data[[function.name]]()) - }) - names(data) = data.to.split - - ## load available additional data sources - additional.data.sources = project.data$get.cached.data.sources("only.additional") - additional.data = lapply(additional.data.sources, function(ds) { - ## build the name of the respective getter 
and call it - function.name = DATASOURCE.TO.ADDITIONAL.ARTIFACT.FUNCTION[[ds]] - return(project.data[[function.name]]()) - }) - names(additional.data) = additional.data.sources + split = split.data.by.time.or.bins(project.data, splitting.length = time.period, bins, split.by.time = TRUE, + number.windows, split.basis, sliding.window, project.conf.new) + return(split) +} - ## number of windows given (ignoring time period and bins) - if (!is.null(number.windows)) { - ## reset bins for the later algorithm - bins = NULL - ## remove sliding windows - sliding.window = FALSE - } - ## if bins are NOT given explicitly - if (is.null(bins)) { - ## get bins based on split.basis - bins = split.get.bins.time.based(data[[split.basis]][["date"]], time.period, number.windows)$bins - bins.labels = head(bins, -1) - split.by.bins = FALSE - ## logging - logging::loginfo("Splitting data '%s' into time ranges of %s based on '%s' data.", - project.data$get.class.name(), time.period, split.basis) - } - ## when bins are given explicitly - else { - ## remove sliding windows - sliding.window = FALSE - ## get bins based on parameter - split.basis = NULL - bins = get.date.from.string(bins) - bins = get.date.string(bins) - bins.labels = head(bins, -1) - split.by.bins = TRUE - ## logging - logging::loginfo("Splitting data '%s' into time ranges [%s].", - project.data$get.class.name(), paste(bins, collapse = ", ")) +#' Split project data in activity-bin-based ranges as specified +#' +#' @param project.data the project data object from which the data is retrieved +#' @param activity.amount the amount of data elements with unique ids to be considered in a bin, an integer. +#' @param bins the bins by which data should be split. Comprises of two components: +#' \code{vector}: Assigns elements of the \code{split.basis} column of \code{project.data} to bins. +#' \code{bins}: Dates defining the start of bins (the last date defines the end of the last bin, in an +#' *exclusive* manner). 
+#' The expected format of \code{bins} is produced by \code{split.get.bins.activity.based}. +#' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues' +#' [default: "commits"] +#' @param sliding.window logical indicating whether a sliding-window approach was used when obtaining the \code{bins}. +#' +#' @return the list of RangeData objects, each referring to one bin +#' +#' @seealso split.get.bins.activity.based +split.data.by.bins = function(project.data, activity.amount, bins, split.basis = c("commits", "mails", "issues"), + sliding.window) { + + # validate type of the 'bins' parameter + if (is.null(bins) || !is.list(bins)) { + logging::logerror("The bins parameter needs to be of type list, (is %s)", class(bins)) + stop("Stopped due to incorrect parameter types") } - bins.date = get.date.from.string(bins) - - ## construct ranges - bins.ranges = construct.ranges(bins) - names(bins.ranges) = bins.ranges - if ((length(bins.ranges) <= 1) && sliding.window) { - logging::logwarn("Sliding-window approach does not apply for one range or less.") - sliding.window = FALSE + # validate existence and type of the 'bins' component of the 'bins' parameter + if (!("bins" %in% names(bins))) { + logging::logerror("The 'bins' parameter needs to include a component 'bins'") + stop("Stopped due to incorrect parameter types") } - if (is.null(project.conf.new)) { - ## Clone the project configuration, so that splitting repeatedly does not interfere - ## with the same configuration. 
- project.conf.new = project.data$get.project.conf()$clone() + dates = parallel::mclapply(bins[["bins"]], get.date.from.string) + if (any(is.na(dates))) { + logging::logerror(paste("The 'bins' component of the 'bins' parameter, needs to be a vector", + "whose elements represent dates")) + stop("Stopped due to incorrect parameter types") } - if (!sliding.window) { - ## split data - data.split = parallel::mclapply(data.to.split, function(df.name) { - logging::logdebug("Splitting %s.", df.name) - ## identify bins for data - df = data[[df.name]] - df.bins = findInterval(df[["date"]], bins.date, all.inside = FALSE) - ## split data according to df.bins - df.split = split(df, df.bins) - ## add proper labels/names - names(df.split) = sapply(as.integer(names(df.split)), function(bin) bins[bin]) - return(df.split) - }) - ## set the names to the data sources obtained earlier - names(data.split) = data.to.split - - ## re-arrange data to get the proper list of data per range - logging::logdebug("Re-arranging data.") - data.split = parallel::mclapply(bins.labels, function(bin) lapply(data.split, `[[`, bin)) - names(data.split) = bins.ranges - - ## adapt project configuration - project.conf.new$set.revisions(bins, bins.date) - - ## construct RangeData objects - logging::logdebug("Constructing RangeData objects.") - - cf.data = parallel::mclapply(bins.ranges, function(range) { - logging::logdebug("Constructing data for range %s.", range) - ## construct object for current range - cf.range.data = RangeData$new(project.conf.new, range) - ## get data for current range - df.list = data.split[[range]] - - ## set main data sources: commits, mails, issues - for (data.source in data.to.split) { - setter.name = sprintf("set.%s", data.source) - cf.range.data[[setter.name]](df.list[[data.source]]) - } - ## set additional data sources: authors, commit.messages, pasta, synchronicity - for (data.source in additional.data.sources) { - setter.name = sprintf("set.%s", data.source) - 
cf.range.data[[setter.name]](additional.data[[data.source]]) - } - - return(cf.range.data) - }) - - } else { - ## perform different steps for sliding-window approach - - ranges = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), - time.period = time.period, overlap = 0.5, raw = FALSE, - include.end.date = FALSE) # bins have already been prepared correctly - bins.info = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), - time.period = time.period, overlap = 0.5, raw = TRUE, - include.end.date = FALSE) # bins have already been prepared correctly - bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info))))) - bins = get.date.string(bins.date) - - logging::loginfo("Splitting data '%s' into time ranges using sliding windows [%s].", - project.data$get.class.name(), ranges) - cf.data = split.data.time.based.by.ranges(project.data, ranges) - - ## update project configuration - project.conf.new$set.revisions(bins, bins.date, sliding.window = TRUE) - for (cf in cf.data) { - ## re-set project configuration due to object duplication - cf.conf = cf$set.project.conf(project.conf.new) - } + # validate existence and type of the 'vector' component of the 'bins' parameter + if (!inherits(bins[["vector"]], "numeric")) { + logging::logerror("The 'vector' component of the bins parameter needs to be a numeric vector") + stop("Stopped due to incorrect parameter types") } - ## add splitting information to project configuration - project.conf.new$set.splitting.info( - type = "time-based", - length = if (split.by.bins) { - bins - } - else { - if (!is.null(number.windows)) { - as.character(lubridate::as.period( - get.time.period.by.amount( - min(data[[split.basis]][["date"]]), - max(data[[split.basis]][["date"]]), - number.windows - ) - )) - } - else time.period - }, - basis = split.basis, - sliding.window = sliding.window, - revisions = bins, - revisions.dates = bins.date - ) - - ## set bin attribute - attr(cf.data, 
"bins") = bins.date - - ## return list of RangeData objects - return(cf.data) + split = split.data.by.time.or.bins(project.data, activity.amount, bins, split.by.time = FALSE, + sliding.window = sliding.window, split.basis = split.basis) + return(split) } #' Split project data by timestamps @@ -360,17 +246,19 @@ split.data.activity.based = function(project.data, activity.type = c("commits", logging::loginfo("Splitting data '%s' into activity ranges of %s %s (%s windows).", project.data$get.class.name(), activity.amount, activity.type, number.windows) - ## get bins based on split.basis + ## get bins based on 'split.basis'. Here the 'include.duplicate.ids' parameter flag must be set, to + ## retrieve bins which map every event to a bin including events with non-unique ids. This is important + ## to ensure that every range really has 'activity.amount' many entries after splitting logging::logdebug("Getting activity-based bins.") bins.data = split.get.bins.activity.based(data[[activity.type]], id.column[[activity.type]], - activity.amount, remove.duplicate.bins = TRUE) + activity.amount, remove.duplicate.bins = TRUE, include.duplicate.ids = TRUE) bins = bins.data[["bins"]] bins.date = get.date.from.string(bins) ## split the data based on the extracted timestamps logging::logdebug("Splitting data based on time windows arising from activity bins.") - cf.data = split.data.time.based(project.data, bins = bins.date, split.basis = activity.type, - project.conf.new = project.conf.new) + cf.data = split.data.by.bins(project.data, bins = bins.data, activity.amount = activity.amount, + sliding.window = sliding.window, split.basis = activity.type) ## perform additional steps for sliding-window approach: ## for activity-based sliding-window bins to work, we need to crop the data appropriately and, @@ -384,23 +272,9 @@ split.data.activity.based = function(project.data, activity.type = c("commits", items.unique = unique(data[[activity.type]][[ id.column[[activity.type]] ]]) 
items.unique.count = length(items.unique) - ## offsets used for cropping (half the first/last bin) + ## offsets used for cropping (half of the first bin) offset.start = floor(activity.amount / 2) - offset.end = (items.unique.count - offset.start) %% activity.amount - ## cut the data appropriately - if (offset.end > 0) { - items.cut = c( - items.unique[seq_len(offset.start)], - items.unique[seq(from = (items.unique.count - offset.end + 1), to = items.unique.count)] - ) - } else { - items.cut = items.unique[seq_len(offset.start)] - } - - ## determine end bin of last sliding-window range - end.event.id = items.unique[(items.unique.count - offset.end + 1)] - end.event.logical = (data[[activity.type]][[ id.column[[activity.type]] ]] == end.event.id) - end.event.date = unique(data[[activity.type]][end.event.logical, ][["date"]]) + items.cut = items.unique[seq_len(offset.start)] ## store the data again data.to.cut = data[[activity.type]][[ id.column[[activity.type]] ]] %in% items.cut @@ -417,12 +291,34 @@ split.data.activity.based = function(project.data, activity.type = c("commits", activity.amount = activity.amount, sliding.window = FALSE, project.conf.new = project.conf.new) + ## extract bins + bins.date.middle = attr(cf.data.sliding, "bins") + + ## Both, the last sliding range and the last regular range end at the very last item. + ## This is the case because the end of the data is never cropped (like the beginning is). + ## split.data.activity.based, which is invoked to obtain both set of ranges, creates + ## ranges until all elements are in one. + ## + ## The conditional below inspects whether the very last item is in the first or the second + ## half of the last regular range. If it is in the first half, there will be a sliding + ## window which covers all items of the last regular range which makes the last regular + ## range obsolete. 
+ ## Similarely if the last item is in the second half of the last regular range, there + ## will be a sliding range (which started at the half of the last regular range) which + ## contains only items also included in the last regular range, which makes the sliding + ## range obsolete. + if ((items.unique.count %% activity.amount) > offset.start) { + cf.data.sliding = cf.data.sliding[-length(cf.data.sliding)] + bins.date.middle = bins.date.middle[-length(bins.date.middle)] + } else { + cf.data = cf.data[-length(cf.data)] + bins.date = bins.date[-length(bins.date)] + bins = bins[-length(bins)] + } + ## append data to normally-split data cf.data = append(cf.data, cf.data.sliding) - ## compute bins for sliding windows: pairwise middle between dates - bins.date.middle = attr(cf.data.sliding, "bins") - ## sort data object properly by bin starts bins.ranges.start = c(head(bins.date, -1), head(bins.date.middle, -1)) cf.data = cf.data[ order(bins.ranges.start) ] @@ -431,38 +327,6 @@ split.data.activity.based = function(project.data, activity.type = c("commits", bins.date = sort(c(bins.date, bins.date.middle)) bins = get.date.string(bins.date) - ## if the last regular range and the last sliding-window range end at the same time - ## and the data of the last regular range is contained in the last sliding-window range, then: - ## remove the last regular range as it is not complete and we don't loose data when removing it - last.regular.range = cf.data[[length(cf.data)]] - last.sliding.range = cf.data[[length(cf.data) - 1]] - get.activity.data = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[activity.type]] - - last.regular.range.ids = (last.regular.range[[get.activity.data]]())[[ id.column[[activity.type]] ]] - last.sliding.range.ids = (last.sliding.range[[get.activity.data]]())[[ id.column[[activity.type]] ]] - if (bins.date[length(bins.date)] == bins.date.middle[length(bins.date.middle)] - && all(last.regular.range.ids %in% last.sliding.range.ids) ) { - - cf.data = 
cf.data[-length(cf.data)] - bins.date = bins.date[-length(bins.date)] - bins = bins[-length(bins)] - } else if (bins.date[length(bins.date)] != bins.date.middle[length(bins.date.middle)]) { - ## adjust the end date of the last sliding-window range, as it might be shorter than it should be: - ## The end of the last range usually is one second after the last event (as end dates are exclusive). - ## In case of sliding windows, the end of the last sliding range needs to be extended to the date of the - ## next event after that range (as end dates are exclusive) to get a full range as for all the previous - ## ranges which end at the beginning of the next range, which is the date of the first event after the - ## actual range. - - ## When we have sliding windows, there are, at least, three ranges (two regular ranges and one - ## sliding-window range. Hence, there are always more than three elements in the bins vector, so accessing - ## bins[length(bins) - 3] cannot throw errors in this case. - name.last.sliding.window = construct.ranges(c(bins[length(bins) - 3], get.date.string(end.event.date))) - names(cf.data)[length(cf.data) - 1] = name.last.sliding.window - bins.date[length(bins.date) - 1] = end.event.date - bins[length(bins) - 1] = get.date.string(end.event.date) - } - ## update project configuration project.conf.new$set.revisions(bins, bins.date, sliding.window = TRUE) for (cf in cf.data) { @@ -973,16 +837,16 @@ split.network.time.based.by.ranges = function(network, ranges, remove.isolates = ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Split raw data ---------------------------------------------------------- -#' Split the given data by the given bins. +#' Split the given datafame by the given bins. 
#' #' @param df a data.frame to be split #' @param bins a vector with the length of 'nrow(df)' assigning a bin for each row of 'df' #' #' @return a list of data.frames, with the length of 'unique(bins)' -split.data.by.bins = function(df, bins) { - logging::logdebug("split.data.by.bins: starting.") +split.dataframe.by.bins = function(df, bins) { + logging::logdebug("split.dataframe.by.bins: starting.") df.split = split(df, bins) - logging::logdebug("split.data.by.bins: finished.") + logging::logdebug("split.dataframe.by.bins: finished.") return(df.split) } @@ -1010,6 +874,227 @@ split.network.by.bins = function(network, bins, bins.vector, remove.isolates = T } +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Internal helper functions for data splitting ---------------------------- + +#' Split project data in time-based or activity-bin-based ranges as specified +#' +#' @param project.data the *Data object from which the data is retrieved +#' @param splitting.length either \code{time.period} from \code{split.data.time.based} +#' or \code{activity.amount} from \code{split.data.by.bins} +#' @param bins either formatted as the \code{bins} parameter of \code{split.data.time.based} +#' or as the \code{bins} parameter of \code{split.data.by.bins} +#' @param split.by.time logical indicating whether splitting is done time-based or activity-bins-based +#' @param number.windows see \code{number.windows} from \code{split.data.time.based} +#' [default: NULL] +#' @param split.basis the data source to use as the basis for split bins, either 'commits', 'mails', or 'issues' +#' [default: "commits"] +#' @param sliding.window logical indicating whether the splitting should be performed using a sliding-window approach +#' [default: FALSE] +#' @param project.conf.new the new project config to construct the \code{RangeData} objects. +#' If \code{NULL}, a clone of \code{project.data$get.project.conf()} will be used. 
+#' [default: NULL] +#' +#' @return the list of RangeData objects, each referring to one time period +#' +#' @seealso split.data.time.based +#' @seealso split.data.by.bins +split.data.by.time.or.bins = function(project.data, splitting.length, bins, split.by.time, + number.windows = NULL, split.basis = c("commits", "mails", "issues"), + sliding.window = FALSE, project.conf.new = NULL) { + + ## get basis for splitting process + split.basis = match.arg(split.basis) + + ## if the data used by the split basis is not present, load it automatically + if (!(split.basis %in% project.data$get.cached.data.sources("only.unfiltered"))) { + function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[split.basis]] + project.data[[function.name]]() + } + + ## get actual raw data + data.to.split = project.data$get.cached.data.sources("only.unfiltered") + + data = lapply(data.to.split, function(ds) { + ## build the name of the respective getter and call it + function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[ds]] + return(project.data[[function.name]]()) + }) + names(data) = data.to.split + + ## load available additional data sources + additional.data.sources = project.data$get.cached.data.sources("only.additional") + additional.data = lapply(additional.data.sources, function(ds) { + ## build the name of the respective getter and call it + function.name = DATASOURCE.TO.ADDITIONAL.ARTIFACT.FUNCTION[[ds]] + return(project.data[[function.name]]()) + }) + names(additional.data) = additional.data.sources + + ## number of windows given (ignoring time period and bins) + if (!is.null(number.windows)) { + ## reset bins for the later algorithm + bins = NULL + ## remove sliding windows + sliding.window = FALSE + } + + ## indicates if time-based splitting is performed using bins + split.time.based.with.bins = FALSE + + ## if bins are NOT given explicitly + if (is.null(bins)) { + ## get bins based on split.basis + bins = split.get.bins.time.based(data[[split.basis]][["date"]], 
splitting.length, number.windows)$bins + bins.labels = head(bins, -1) + ## logging + logging::loginfo("Splitting data '%s' into time ranges of %s based on '%s' data.", + project.data$get.class.name(), splitting.length, split.basis) + } + ## when bins are given explicitly, get bins based on parameter + else { + if (split.by.time) { + split.time.based.with.bins = TRUE + split.basis = NULL + bins = get.date.from.string(bins) + bins = get.date.string(bins) + ## remove sliding windows + sliding.window = FALSE + } else { + ## sliding windows do not need to be removed here, as sliding windows and bins + ## are not contradicting in activity-based splitting + bins.vector = bins[["vector"]] + bins = bins[["bins"]] + } + bins.labels = head(bins, -1) + ## logging + logging::loginfo("Splitting data '%s' into time ranges [%s].", + project.data$get.class.name(), paste(bins, collapse = ", ")) + } + bins.date = get.date.from.string(bins) + + ## construct ranges + bins.ranges = construct.ranges(bins) + names(bins.ranges) = bins.ranges + + if ((length(bins.ranges) <= 1) && sliding.window) { + logging::logwarn("Sliding-window approach does not apply for one range or less.") + sliding.window = FALSE + } + + if (is.null(project.conf.new)) { + ## Clone the project configuration, so that splitting repeatedly does not interfere + ## with the same configuration. 
+ project.conf.new = project.data$get.project.conf()$clone() + } + + if (!sliding.window || !split.by.time) { + ## split data + data.split = parallel::mclapply(data.to.split, function(df.name) { + logging::logdebug("Splitting %s.", df.name) + ## identify bins for data + df = data[[df.name]] + df.bins = if (!split.by.time && (df.name == split.basis)) + bins.vector + else + findInterval(df[["date"]], bins.date, all.inside = FALSE) + ## split data according to df.bins + df.split = split(df, df.bins) + ## add proper labels/names + names(df.split) = sapply(as.integer(names(df.split)), function(bin) bins[bin]) + return(df.split) + }) + ## set the names to the data sources obtained earlier + names(data.split) = data.to.split + + ## re-arrange data to get the proper list of data per range + logging::logdebug("Re-arranging data.") + data.split = parallel::mclapply(bins.labels, function(bin) lapply(data.split, `[[`, bin)) + names(data.split) = bins.ranges + + ## adapt project configuration + project.conf.new$set.revisions(bins, bins.date) + + ## construct RangeData objects + logging::logdebug("Constructing RangeData objects.") + + cf.data = parallel::mclapply(bins.ranges, function(range) { + logging::logdebug("Constructing data for range %s.", range) + ## construct object for current range + cf.range.data = RangeData$new(project.conf.new, range) + ## get data for current range + df.list = data.split[[range]] + + ## set main data sources: commits, mails, issues + for (data.source in data.to.split) { + setter.name = sprintf("set.%s", data.source) + cf.range.data[[setter.name]](df.list[[data.source]]) + } + ## set additional data sources: authors, commit.messages, pasta, synchronicity + for (data.source in additional.data.sources) { + setter.name = sprintf("set.%s", data.source) + cf.range.data[[setter.name]](additional.data[[data.source]]) + } + + return(cf.range.data) + }) + + } else { + ## perform different steps for sliding-window approach of time-based splitting + + ranges 
= construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), + time.period = splitting.length, overlap = 0.5, raw = FALSE, + include.end.date = FALSE) # bins have already been prepared correctly + bins.info = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), + time.period = splitting.length, overlap = 0.5, raw = TRUE, + include.end.date = FALSE) # bins have already been prepared correctly + bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info))))) + bins = get.date.string(bins.date) + + logging::loginfo("Splitting data '%s' into time ranges using sliding windows [%s].", + project.data$get.class.name(), ranges) + cf.data = split.data.time.based.by.ranges(project.data, ranges) + + ## update project configuration + project.conf.new$set.revisions(bins, bins.date, sliding.window = TRUE) + for (cf in cf.data) { + ## re-set project configuration due to object duplication + cf.conf = cf$set.project.conf(project.conf.new) + } + } + + ## add splitting information to project configuration + project.conf.new$set.splitting.info( + type = if (split.by.time) "time-based" else "activity-based", + length = if (split.time.based.with.bins) { + bins + } + else { + if (!is.null(number.windows)) { + as.character(lubridate::as.period( + get.time.period.by.amount( + min(data[[split.basis]][["date"]]), + max(data[[split.basis]][["date"]]), + number.windows + ) + )) + } + else splitting.length + }, + basis = split.basis, + sliding.window = sliding.window, + revisions = bins, + revisions.dates = bins.date + ) + + ## set bin attribute + attr(cf.data, "bins") = bins.date + + ## return list of RangeData objects + return(cf.data) +} + + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Unification of range names ---------------------------------------------- @@ -1102,13 +1187,18 @@ split.get.bins.time.based = function(dates, time.period, number.windows = NULL) #' @param activity.amount the amount of 
activity denoting the number of unique items #' in each split bin [default: 5000] #' @param remove.duplicate.bins remove duplicate bin borders? [default: FALSE] +#' @param include.duplicate.ids include entries of the \code{df} with non-unique ids +#' in the creation of the bins. This should! not change bin borders +#' as entries with the same id should! share the same \code{date} attribute. +#' [default: FALSE] #' #' @return a list, #' the item 'vector': the bins each row in 'df' belongs to (increasing integers), #' the item 'bins': the bin labels, described by dates, each bin containing -#' 'acitivity.amount' many unique items; each item in the vector indicates +#' 'activity.amount' many unique items; each item in the vector indicates #' the start of a bin, although the last item indicates the end of the last bin -split.get.bins.activity.based = function(df, id, activity.amount, remove.duplicate.bins = FALSE) { +split.get.bins.activity.based = function(df, id, activity.amount, remove.duplicate.bins = FALSE, + include.duplicate.ids = FALSE) { logging::logdebug("split.get.bins.activity.based: starting") ## get the unique integer IDs for each item in 'id' column ids = df[[id]] @@ -1120,11 +1210,23 @@ split.get.bins.activity.based = function(df, id, activity.amount, remove.duplica if (bins.number.complete != 0) rep(seq_len(bins.number.complete), each = activity.amount), rep(bins.number.complete + 1, bins.number.incomplete) ) + + ## pad bins with entries for all duplicate ids + if (include.duplicate.ids) { + bins.activity.padded = c() + for (i in seq_along(ids)) { + ## create an extra entry for every duplicate id in the same bin as + ## the first occurance of the id + current.bin = bins.activity[ which(ids.unique == ids[i]) ] + bins.activity.padded = c(bins.activity.padded, current.bin) + } + bins.activity = bins.activity.padded + } bins.number = max(bins.activity) ## join ids and bin numbers bins.mapping = data.frame( - id = ids.unique, + id = if 
(include.duplicate.ids) ids else ids.unique, bin = bins.activity )