From a1842e9be46596321ee86860fd87d17a3c88f50f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Sat, 2 Dec 2023 11:46:09 +0100 Subject: [PATCH 01/13] Simplify network splitting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move setting of the 'bins' attribute on networks from 'split.network.time.based' and 'split.networks.time.based' into 'split.network.by.bins'. This makes calculating the 'bins' attribute in 'split.networks.time.based' obsolete. Introduce 'get.bin.dates.from.ranges' wrapper function to aid in converting from ranges to bins where needed. Simplify conversion from bins to ranges and back in 'split.network.time.based'. Replace unnecessary mapply in 'split.network.time.based.by.ranges' by simpler lapply. Signed-off-by: Maximilian Löffler --- util-split.R | 58 ++++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/util-split.R b/util-split.R index 62db7f8a9..12c5049fc 100644 --- a/util-split.R +++ b/util-split.R @@ -472,7 +472,6 @@ split.data.time.based.by.ranges = function(project.data, ranges) { return(data.split) } - ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Split networks ---------------------------------------------------------- @@ -534,28 +533,20 @@ split.network.time.based = function(network, time.period = "3 months", bins = NU if (sliding.window) { ranges = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), time.period = time.period, overlap = 0.5, raw = FALSE, - include.end.date = FALSE) # bins have already been prepared correctly - bins.info = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), - time.period = time.period, overlap = 0.5, raw = TRUE, - include.end.date = FALSE) # bins have already been prepared correctly - bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info))))) - - 
logging::loginfo("Splitting network into time ranges [%s].", + include.end.date = FALSE) + logging::loginfo("Splitting network into overlapping time ranges [%s].", paste(ranges, collapse = ", ")) nets = split.network.time.based.by.ranges(network, ranges, remove.isolates) } else { - logging::loginfo("Splitting network into bins [%s].", - paste(bins.date, collapse = ", ")) - nets = split.network.by.bins(network, bins, bins.vector, remove.isolates) + revs = get.date.string(bins.date) + ranges = construct.ranges(revs, sliding.window = FALSE) + logging::loginfo("Splitting network into non-overlapping time ranges [%s].", + paste(ranges, collapse = ", ")) + nets = split.network.by.bins(network, bins, bins.vector, bins.date, remove.isolates) } - ## set bin attribute - attr(nets, "bins") = bins.date - ## set ranges as names - revs = get.date.string(bins.date) - names(nets) = construct.ranges(revs, sliding.window = sliding.window) - + names(nets) = ranges return(nets) } @@ -615,10 +606,6 @@ split.networks.time.based = function(networks, time.period = "3 months", bins = ranges = construct.overlapping.ranges(start = min(dates), end = max(dates), time.period = time.period, overlap = 0.5, raw = FALSE, include.end.date = TRUE) - bins.info = construct.overlapping.ranges(start = min(dates), end = max(dates), - time.period = time.period, overlap = 0.5, raw = TRUE, - include.end.date = TRUE) - bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info))))) } else { bins.info = split.get.bins.time.based(dates, time.period, number.windows) bins.date = get.date.from.string(bins.info[["bins"]]) @@ -636,7 +623,6 @@ split.networks.time.based = function(networks, time.period = "3 months", bins = if (sliding.window) { nets = split.network.time.based.by.ranges(network = net, ranges = ranges, remove.isolates = remove.isolates) - attr(nets, "bins") = bins.date } else { nets = split.network.time.based(network = net, bins = bins.date, sliding.window = sliding.window, 
remove.isolates = remove.isolates) @@ -717,7 +703,7 @@ split.network.activity.based = function(network, number.edges = 5000, number.win bins.vector = bins.vector[ with(df, order(my.unique.id)) ] # re-order to get igraph ordering bins = sort(unique(bins.vector)) ## split network by bins - networks = split.network.by.bins(network, bins, bins.vector, remove.isolates) + networks = split.network.by.bins(network, bins, bins.vector, remove.isolates = remove.isolates) if (number.edges >= edge.count) { logging::logwarn("Sliding-window approach does not apply: not enough edges (%s) for number of edges %s", @@ -818,11 +804,9 @@ split.network.time.based.by.ranges = function(network, ranges, remove.isolates = ranges.bounds = lapply(ranges, get.range.bounds) ## loop over all ranges and split the network accordingly: - nets.split = mapply( - ranges, ranges.bounds, SIMPLIFY = FALSE, - FUN = function(range, start.end) { + nets.split = lapply(ranges.bounds, function(bounds) { ## 1) split the network to the current range - range.net = split.network.time.based(network, bins = start.end, sliding.window = FALSE, + range.net = split.network.time.based(network, bins = bounds, sliding.window = FALSE, remove.isolates = remove.isolates)[[1]] ## 2) return the network @@ -855,10 +839,12 @@ split.dataframe.by.bins = function(df, bins) { #' @param network a network #' @param bins a vector with the unique bin identifiers, describing the order in which the bins are created #' @param bins.vector a vector of length 'ecount(network)' assigning a bin for each edge of 'network' +#' @param bins.date a vector of dates representing the start of each bin. 
If present, then the dates will be set +#' as an attribute on the returned networks [default: NULL] #' @param remove.isolates whether to remove isolates in the resulting split networks [default: TRUE] #' #' @return a list of networks, with the length of 'unique(bins.vector)' -split.network.by.bins = function(network, bins, bins.vector, remove.isolates = TRUE) { +split.network.by.bins = function(network, bins, bins.vector, bins.date = NULL, remove.isolates = TRUE) { logging::logdebug("split.network.by.bins: starting.") ## create a network for each bin of edges nets = parallel::mclapply(bins, function(bin) { @@ -869,6 +855,10 @@ split.network.by.bins = function(network, bins, bins.vector, remove.isolates = T g = igraph::subgraph.edges(network, edges, delete.vertices = remove.isolates) return(g) }) + ## set 'bins' attribute, if specified + if (!is.null(bins.date)) { + attr(nets, "bins") = bins.date + } logging::logdebug("split.network.by.bins: finished.") return(nets) } @@ -1048,7 +1038,7 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli bins.info = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), time.period = splitting.length, overlap = 0.5, raw = TRUE, include.end.date = FALSE) # bins have already been prepared correctly - bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info))))) + bins.date = get.bin.dates.from.ranges(bins.info) bins = get.date.string(bins.date) logging::loginfo("Splitting data '%s' into time ranges using sliding windows [%s].", @@ -1094,6 +1084,16 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli return(cf.data) } +#' Obtain the start and end dates from given ranges. 
+#' +#' @param ranges the ranges to get the dates from +#' +#' @return a sorted vector of all the start the end dates of the given ranges +get.bin.dates.from.ranges = function(ranges) { + dates = sort(unname(unique(get.date.from.unix.timestamp(unlist(ranges))))) + return(dates) +} + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Unification of range names ---------------------------------------------- From 858b1812ebfc3194cc6a03c99f3ee7d161d1ca15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Sun, 10 Dec 2023 18:35:57 +0100 Subject: [PATCH 02/13] Move 'get.bin.dates.from.ranges' into 'util-misc' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- util-misc.R | 11 +++++++++++ util-split.R | 10 ---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/util-misc.R b/util-misc.R index 88817a616..afff72660 100644 --- a/util-misc.R +++ b/util-misc.R @@ -1011,3 +1011,14 @@ get.data.from.range = function(range, data) { return(data.between) } } + +#' Obtain the start and end dates from given ranges. +#' +#' @param ranges the ranges to get the dates from +#' +#' @return a sorted vector of all the start the end dates of the given ranges +get.bin.dates.from.ranges = function(ranges) { + dates = sort(unname(unique(get.date.from.unix.timestamp(unlist(ranges))))) + return(dates) +} + diff --git a/util-split.R b/util-split.R index 12c5049fc..f08cd3246 100644 --- a/util-split.R +++ b/util-split.R @@ -1084,16 +1084,6 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli return(cf.data) } -#' Obtain the start and end dates from given ranges. 
-#' -#' @param ranges the ranges to get the dates from -#' -#' @return a sorted vector of all the start the end dates of the given ranges -get.bin.dates.from.ranges = function(ranges) { - dates = sort(unname(unique(get.date.from.unix.timestamp(unlist(ranges))))) - return(dates) -} - ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Unification of range names ---------------------------------------------- From fa3167c289c9785f3a5db03d9724848f1441a63d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Sun, 21 Jan 2024 11:46:46 +0100 Subject: [PATCH 03/13] Deduplicate JIRA issue edges in undirected networks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For JIRA issue data, the codeface-extraction creates additional 'add_link' issue-events back from the referenced issue to the referencing issue. Since PR se-sic:codeface-extraction#51, the extraction also automatically adds corresponding 'referenced_by' events. We need to deduplicate both. 
Signed-off-by: Maximilian Löffler --- util-networks.R | 52 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/util-networks.R b/util-networks.R index e21b40791..a8cc08f70 100644 --- a/util-networks.R +++ b/util-networks.R @@ -486,11 +486,55 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", artifacts.net.data.raw = private$proj.data[[DATASOURCE.TO.ARTIFACT.FUNCTION[["issues"]]]]() ## obtain issue-connecting events - add.links = artifacts.net.data.raw[artifacts.net.data.raw$event.name == "add_link" & + add.links = artifacts.net.data.raw[artifacts.net.data.raw$event.name == "add_link" & artifacts.net.data.raw$event.info.2 == "issue", ] - referenced.bys = artifacts.net.data.raw[artifacts.net.data.raw$event.name == "referenced_by" & + referenced.bys = artifacts.net.data.raw[artifacts.net.data.raw$event.name == "referenced_by" & artifacts.net.data.raw$event.info.2 == "issue", ] + ## the codeface extraction for jira issues creates duplicate events, linking the referenced issue + ## to the referencing issue, in addition to the correct events, linking the referencing issue to + ## the referenced issue. We can only deduplicate them, if we build an undirected network, as otherwise, + ## we would need to guess the correct direction. + if (!private$network.conf$get.entry("artifact.directed")) { + + ## obtain add_link events from jira + jira.add.links = add.links[add.links$issue.source == "jira", ] + matched = c() + + ## iterate over all add_link events from jira + for (i in 1:nrow(jira.add.links)) { + + add.link = jira.add.links[i, ] + if (all(add.link %in% matched)) { + next + } + + ## match any add_link events, that are the reverse direction of 'add.link', + ## but the same timestamp and author information. 
+ match = jira.add.links[( + jira.add.links$issue.id == add.link$event.info.1 & + jira.add.links$event.info.1 == add.link$issue.id & + jira.add.links$date == add.link$date & + jira.add.links$author.name == add.link$author.name), ] + + ## if a match is found, remove 'add.link' and its corresponding referenced_by event + if (nrow(match) > 0) { + add.links = add.links[!( + add.links$issue.id == add.link$issue.id & + add.links$event.info.1 == add.link$event.info.1 & + add.links$date == add.link$date & + add.links$author.name == add.link$author.name), ] + referenced.bys = referenced.bys[!( + referenced.bys$issue.id == match$issue.id & + referenced.bys$event.info.1 == match$event.info.1 & + referenced.bys$date == match$date & + referenced.bys$author.name == match$author.name), ] + matched = c(match, add.link) + } + } + } + + if (nrow(add.links) != nrow(referenced.bys)) { logging::logwarn("Inconsistent issue data. Unequally many 'add_link' and 'referenced_by' issue-events.") } @@ -515,8 +559,8 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", ## construct edge to = subset(referenced.bys, - event.info.1 == from[["issue.id"]] & - author.name == from[["author.name"]] & + event.info.1 == from[["issue.id"]] & + author.name == from[["author.name"]] & date == from[["date"]]) if (!all(is.na(to))) { combination = list("from" = from[["issue.id"]], "to" = to[["issue.id"]]) From 943228fbc91eed6854dacafa7075441e58b22675 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Sun, 21 Jan 2024 12:03:43 +0100 Subject: [PATCH 04/13] Adjust sliding-window approach in network-based splitting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adapt the sliding-window data-splitting approach from PR #244 to network splitting, especially the removal of the last (redundant) range. * Fix an edge-case bug in said algorithm, where the wrong last split is removed. 
This could happen when the subject to split can be exactly divided by the desired size of a split. * Fix a faulty test, that only became apparent after the adjustments to the sliding-window algorithm in network-splitting Signed-off-by: Maximilian Löffler --- tests/test-split-network-activity-based.R | 2 +- util-split.R | 60 +++++++++++------------ 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/tests/test-split-network-activity-based.R b/tests/test-split-network-activity-based.R index 52d7b8f03..2ee0bc7d6 100644 --- a/tests/test-split-network-activity-based.R +++ b/tests/test-split-network-activity-based.R @@ -162,7 +162,7 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(3, 5)), "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(5, 4)), "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 7)), - "2016-07-12 16:06:10-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(7, 6)), + "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(7, 6)), "2016-07-12 16:06:32-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(6, 8)) ) results = split.network.activity.based(author.net, number.edges = 2, sliding.window = TRUE) diff --git a/util-split.R b/util-split.R index f08cd3246..2738daf1c 100644 --- a/util-split.R +++ b/util-split.R @@ -307,7 +307,8 @@ split.data.activity.based = function(project.data, activity.type = c("commits", ## will be a sliding range (which started at the half of the last regular range) which ## contains only items also included in the last regular range, which makes the sliding ## range obsolete. 
- if ((items.unique.count %% activity.amount) > offset.start) { + length.of.last.range = items.unique.count %% activity.amount + if (length.of.last.range > offset.start || length.of.last.range == 0) { cf.data.sliding = cf.data.sliding[-length(cf.data.sliding)] bins.date.middle = bins.date.middle[-length(bins.date.middle)] } else { @@ -721,16 +722,7 @@ split.network.activity.based = function(network, number.edges = 5000, number.win ## offsets used for cropping (half the first/last bin) offset.start = floor(number.edges / 2) - offset.end = (edge.count - offset.start) %% number.edges - ## cut the data appropriately - if (offset.end > 0) { - edges.cut = c( - edges.by.date[seq_len(offset.start)], - edges.by.date[seq(from = (edge.count - offset.end + 1), to = edge.count)] - ) - } else { - edges.cut = edges.by.date[seq_len(offset.start)] - } + edges.cut = edges.by.date[seq_len(offset.start)] ## delete edges from the network and create a new network network.cut = igraph::delete.edges(network, igraph::E(network)[edges.cut]) @@ -739,33 +731,41 @@ split.network.activity.based = function(network, number.edges = 5000, number.win networks.sliding = split.network.activity.based(network.cut, number.edges = number.edges, sliding.window = FALSE) - ## append data to normally-split data - networks = append(networks, networks.sliding) - ## compute bins for sliding windows: pairwise middle between dates bins.date.middle = attr(networks.sliding, "bins") - ## sort data object properly by bin starts - bins.ranges.start = c(head(bins.date, -1), head(bins.date.middle, -1)) - networks = networks[ order(bins.ranges.start) ] - - ## construct proper bin vectors for configuration - bins.date = sort(c(bins.date, bins.date.middle)) - - ## if the last regular range and the last sliding-window range end at the same time - ## and the latter contains the former's edges, then: - ## remove the last regular range as it is not complete and we don't loose data when removing it - edges.last.regular = 
igraph::E(networks[[length(networks)]]) - edges.last.sliding = igraph::E(networks[[length(networks) - 1]]) - if (bins.date[length(bins.date)] == bins.date.middle[length(bins.date.middle)] - && all(edges.last.regular %in% edges.last.sliding) - && table(edges.last.regular$date) %in% table(edges.last.sliding$date) ) { - + ## Both, the last sliding network and the last regular network end at the very last edge. + ## This is the case because the end of the edges is never cropped (like the beginning is). + ## Both split.network.activity.based, and split.network.by.bins, which are invoked to obtain + ## the two set of networks, creates networks until all edges are contained. + ## + ## The conditional below inspects whether the very last edge is in the first or the second + ## half of the last regular network. If it is in the first half, there will be a sliding + ## network which covers all edges of the last regular network which makes the last regular + ## network obsolete. + ## Similarely if the last edge is in the second half of the last regular network, there + ## will be a sliding network (which started at the half of the last regular network) which + ## contains only edges also included in the last regular network, which makes the sliding + ## network obsolete. 
+ length.of.last.range = edge.count %% number.edges + if (length.of.last.range > offset.start || length.of.last.range == 0) { + networks.sliding = networks.sliding[-length(networks.sliding)] + bins.date.middle = bins.date.middle[-length(bins.date.middle)] + } else { networks = networks[-length(networks)] bins.date = bins.date[-length(bins.date)] bins = bins[-length(bins)] } + ## append sliding networks to normally-split networks + networks = append(networks, networks.sliding) + + ## sort networks properly by bin starts + bins.ranges.start = c(head(bins.date, -1), head(bins.date.middle, -1)) + networks = networks[ order(bins.ranges.start) ] + + ## construct proper bin vectors for configuration + bins.date = sort(c(bins.date, bins.date.middle)) } ## set bin attribute From 5a2021ddc3c270f41a80ff998cf7799bcbe73a28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Sun, 21 Jan 2024 15:24:31 +0100 Subject: [PATCH 05/13] Update Copyright Headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- tests/test-split-network-activity-based.R | 1 + util-misc.R | 2 +- util-networks.R | 2 +- util-split.R | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test-split-network-activity-based.R b/tests/test-split-network-activity-based.R index 2ee0bc7d6..472f91b71 100644 --- a/tests/test-split-network-activity-based.R +++ b/tests/test-split-network-activity-based.R @@ -15,6 +15,7 @@ ## Copyright 2020 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## Copyright 2022 by Jonathan Baumann +## Copyright 2024 by Maximilian Löffler ## All Rights Reserved. 
context("Splitting functionality, activity-based splitting of networks.") diff --git a/util-misc.R b/util-misc.R index afff72660..d6ced669c 100644 --- a/util-misc.R +++ b/util-misc.R @@ -20,7 +20,7 @@ ## Copyright 2018-2019 by Jakob Kronawitter ## Copyright 2021 by Niklas Schneider ## Copyright 2022 by Jonathan Baumann -## Copyright 2022-2023 by Maximilian Löffler +## Copyright 2022-2024 by Maximilian Löffler ## All Rights Reserved. diff --git a/util-networks.R b/util-networks.R index a8cc08f70..065b82a26 100644 --- a/util-networks.R +++ b/util-networks.R @@ -21,7 +21,7 @@ ## Copyright 2020 by Anselm Fehnker ## Copyright 2021 by Niklas Schneider ## Copyright 2022 by Jonathan Baumann -## Copyright 2023 by Maximilian Löffler +## Copyright 2023-2024 by Maximilian Löffler ## All Rights Reserved. diff --git a/util-split.R b/util-split.R index 2738daf1c..f76fa66b4 100644 --- a/util-split.R +++ b/util-split.R @@ -22,7 +22,7 @@ ## Copyright 2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2022 by Jonathan Baumann -## Copyright 2023 by Maximilian Löffler +## Copyright 2023-2024 by Maximilian Löffler ## All Rights Reserved. From 46d581d5e1f63260692b396a8bd8f51b0da48fda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Thu, 25 Jan 2024 22:25:49 +0100 Subject: [PATCH 06/13] Remove the other JIRA issue edge while deduplicating MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we only deduplicate JIRA issue edges when we build undirected networks, it does not matter which of the two duplicates we remove. Some tests can be adjusted more easily to the JIRA edge deduplication, by swapping which of the edges get removed. 
Signed-off-by: Maximilian Löffler --- util-networks.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/util-networks.R b/util-networks.R index 065b82a26..891d50402 100644 --- a/util-networks.R +++ b/util-networks.R @@ -520,15 +520,15 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", ## if a match is found, remove 'add.link' and its corresponding referenced_by event if (nrow(match) > 0) { add.links = add.links[!( - add.links$issue.id == add.link$issue.id & - add.links$event.info.1 == add.link$event.info.1 & - add.links$date == add.link$date & + add.links$issue.id == match$issue.id & + add.links$event.info.1 == match$event.info.1 & + add.links$date == match$date & add.links$author.name == add.link$author.name), ] referenced.bys = referenced.bys[!( - referenced.bys$issue.id == match$issue.id & - referenced.bys$event.info.1 == match$event.info.1 & - referenced.bys$date == match$date & - referenced.bys$author.name == match$author.name), ] + referenced.bys$issue.id == add.link$issue.id & + referenced.bys$event.info.1 == add.link$event.info.1 & + referenced.bys$date == add.link$date & + referenced.bys$author.name == add.link$author.name), ] matched = c(match, add.link) } } From 6eb731102301b1af08f4affb40d1f8df94500e34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Thu, 25 Jan 2024 22:33:21 +0100 Subject: [PATCH 07/13] Add duplicate JIRA issue events to testing data and adjust tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As the codeface extraction produces duplicate 'add_link' and 'referenced_by' events from JIRA repositories, the coronet testing data should reflect that. Adjust the tests to work with added JIRA issue events. 
Signed-off-by: Maximilian Löffler --- .../test_feature/feature/issues-jira.list | 2 + .../test_proximity/proximity/issues-jira.list | 2 + tests/test-networks-artifact.R | 22 ++++- tests/test-networks-author.R | 41 +++++----- tests/test-read.R | 58 ++++++------- tests/test-split-data-activity-based.R | 82 +++++++++---------- tests/test-split-data-time-based.R | 52 ++++++------ 7 files changed, 140 insertions(+), 119 deletions(-) diff --git a/tests/codeface-data/results/testing/test_feature/feature/issues-jira.list b/tests/codeface-data/results/testing/test_feature/feature/issues-jira.list index c39f31ed2..9d443053e 100644 --- a/tests/codeface-data/results/testing/test_feature/feature/issues-jira.list +++ b/tests/codeface-data/results/testing/test_feature/feature/issues-jira.list @@ -12,6 +12,8 @@ "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"commented";"Olaf";"olaf@example.org";"2013-06-01 06:50:26";"open";"[""unresolved""]" "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"resolution_updated";"Björn";"bjoern@example.org";"2013-06-01 06:53:06";"fixed";"""unresolved""" "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"referenced_by";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-332";"""issue""" +"ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", 
""Interpreters""]";"add_link";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-332";"""issue""" +"ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"referenced_by";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-328";"""issue""" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"add_link";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-328";"""issue""" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"created";"Björn";"bjoern@example.org";"2016-07-12 16:01:30";"open";"[""unresolved""]" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"commented";"Björn";"bjoern@example.org";"2016-07-12 16:02:30";"open";"[""unresolved""]" diff --git a/tests/codeface-data/results/testing/test_proximity/proximity/issues-jira.list b/tests/codeface-data/results/testing/test_proximity/proximity/issues-jira.list index c39f31ed2..9d443053e 100644 --- a/tests/codeface-data/results/testing/test_proximity/proximity/issues-jira.list +++ b/tests/codeface-data/results/testing/test_proximity/proximity/issues-jira.list @@ -12,6 +12,8 @@ "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"commented";"Olaf";"olaf@example.org";"2013-06-01 06:50:26";"open";"[""unresolved""]" "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for 
interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"resolution_updated";"Björn";"bjoern@example.org";"2013-06-01 06:53:06";"fixed";"""unresolved""" "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"referenced_by";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-332";"""issue""" +"ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"add_link";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-332";"""issue""" +"ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"referenced_by";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-328";"""issue""" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"add_link";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-328";"""issue""" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"created";"Björn";"bjoern@example.org";"2016-07-12 16:01:30";"open";"[""unresolved""]" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"commented";"Björn";"bjoern@example.org";"2016-07-12 16:02:30";"open";"[""unresolved""]" diff --git 
a/tests/test-networks-artifact.R b/tests/test-networks-artifact.R index 849a4ad17..88d77bb11 100644 --- a/tests/test-networks-artifact.R +++ b/tests/test-networks-artifact.R @@ -117,11 +117,11 @@ patrick::with_parameters_test_that("Network construction of an issue-based artif type = TYPE.ARTIFACT) ## 2) edges edges = data.frame( - from = c("", "", ""), - to = c("", "", ""), + from = c("", "", ""), + to = c("", "", ""), date = get.date.from.string(c("2016-08-07 15:30:00", "2016-08-07 15:37:02", "2017-05-21 12:00:00")), artifact.type = c("IssueEvent", "IssueEvent", "IssueEvent"), - issue.id = c("", "", ""), + issue.id = c("", "", ""), event.name = c("add_link", "add_link", "add_link"), author.name = c("Thomas", "Karl", "Thomas"), weight = c(1, 1, 1), @@ -129,6 +129,22 @@ patrick::with_parameters_test_that("Network construction of an issue-based artif relation = "issue" ) + ## 3) when constructing directed networks, we cannot deduplicate jira edges + if (test.directed) { + edges = rbind(edges, data.frame( + from = "", + to = "", + date = get.date.from.string("2017-05-21 12:00:00"), + artifact.type = "IssueEvent", + issue.id = "", + event.name = "add_link", + author.name = "Thomas", + weight = 1, + type = TYPE.EDGES.INTRA, + relation = "issue" + )) + } + ## configurations proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) proj.conf$update.value("issues.only.comments", FALSE) diff --git a/tests/test-networks-author.R b/tests/test-networks-author.R index f2c250a96..4de1460d9 100644 --- a/tests/test-networks-author.R +++ b/tests/test-networks-author.R @@ -494,8 +494,8 @@ test_that("Network construction of the undirected author-issue network with all rep("Olaf", 3), # rep("Thomas", 4), rep("Karl", 3), rep("Björn", 7), rep("Olaf", 3), rep("Thomas", 3), rep("Thomas", 7), rep("Thomas", 3), rep("Björn", 6), rep("Olaf", 2), rep("Olaf", 6), # - rep("Thomas", 10), rep("Thomas", 7), rep("Olaf", 11), # - rep("Björn", 6), rep("Thomas", 4), rep("Thomas", 
4) # + rep("Thomas", 11), rep("Thomas", 8), rep("Olaf", 11), # + rep("Björn", 6), rep("Thomas", 5), rep("Thomas", 5) # ), to = c(rep("Olaf", 5), rep("Björn", 4), rep("Björn", 3), # rep("Björn", 4), rep("Björn", 3), rep("Olaf", 3), # @@ -503,8 +503,8 @@ test_that("Network construction of the undirected author-issue network with all rep("Björn", 3), # rep("udo", 4), rep("udo", 3), rep("udo", 7), rep("udo", 3), rep("Karl", 3), rep("Björn", 7), rep("Olaf", 3), rep("Karl", 6), rep("Karl", 2), rep("Björn", 6), # - rep("Björn", 10), rep("Olaf", 7), rep("Björn", 11), # - rep("Max", 6), rep("Björn", 4), rep("Max", 4) # + rep("Björn", 11), rep("Olaf", 8), rep("Björn", 11), # + rep("Max", 6), rep("Björn", 5), rep("Max", 5) # ), date = get.date.from.string(c( "2016-07-12 15:59:25", "2016-07-12 15:59:25", "2016-07-12 15:59:59", # "2016-07-12 16:01:01", "2016-07-14 13:37:00", "2016-07-12 15:59:25", @@ -537,25 +537,26 @@ test_that("Network construction of the undirected author-issue network with all "2016-10-05 15:30:02", "2016-12-07 15:30:02", "2016-12-07 15:30:02", "2017-05-23 12:32:39", "2017-05-23 12:31:34", "2013-04-21 23:52:09", "2013-04-21 23:52:09", "2017-05-21 12:00:00", # - "2013-05-05 21:46:30", "2013-05-05 21:49:21", "2013-05-05 21:49:34", - "2013-05-06 01:04:34", "2013-05-25 03:48:41", "2013-05-25 04:08:07", - "2013-06-01 06:53:06", "2013-04-21 23:52:09", "2013-04-21 23:52:09", - "2017-05-21 12:00:00", "2013-05-25 03:25:06", "2013-05-25 06:06:53", - "2013-05-25 06:22:23", "2013-06-01 06:50:26", "2013-05-05 21:46:30", - "2013-05-05 21:49:21", "2013-05-05 21:49:34", "2013-05-06 01:04:34", - "2013-05-25 03:48:41", "2013-05-25 04:08:07", "2013-06-01 06:53:06", + "2017-05-21 12:00:00", "2013-05-05 21:46:30", "2013-05-05 21:49:21", + "2013-05-05 21:49:34", "2013-05-06 01:04:34", "2013-05-25 03:48:41", + "2013-05-25 04:08:07", "2013-06-01 06:53:06", "2013-04-21 23:52:09", + "2013-04-21 23:52:09", "2017-05-21 12:00:00", "2017-05-21 12:00:00", "2013-05-25 03:25:06", 
"2013-05-25 06:06:53", "2013-05-25 06:22:23", - "2013-06-01 06:50:26", + "2013-06-01 06:50:26", "2013-05-05 21:46:30", "2013-05-05 21:49:21", + "2013-05-05 21:49:34", "2013-05-06 01:04:34", "2013-05-25 03:48:41", + "2013-05-25 04:08:07", "2013-06-01 06:53:06", "2013-05-25 03:25:06", + "2013-05-25 06:06:53", "2013-05-25 06:22:23", "2013-06-01 06:50:26", "2016-07-12 16:01:30", "2016-07-12 16:02:30", "2016-07-15 19:55:39", # "2016-07-15 20:07:47", "2016-07-27 20:12:08", "2016-07-28 06:27:52", "2016-07-12 16:01:30", "2016-07-12 16:02:30", "2016-07-15 19:55:39", - "2017-05-21 12:00:00", "2016-07-15 20:07:47", "2016-07-27 20:12:08", - "2016-07-28 06:27:52", "2017-05-21 12:00:00" + "2017-05-21 12:00:00", "2017-05-21 12:00:00", "2016-07-15 20:07:47", + "2016-07-27 20:12:08", "2016-07-28 06:27:52", "2017-05-21 12:00:00", + "2017-05-21 12:00:00" )), artifact.type = "IssueEvent", issue.id = c(rep("", 12), rep("", 10), rep("", 18), - rep("", 3), rep("", 44), rep("", 28), - rep("", 14)), + rep("", 3), rep("", 44), rep("", 30), + rep("", 16)), event.name = c("created", "commented", "state_updated", "commented", "state_updated", "created", # "commented", "state_updated", "commented", "commented", "state_updated", "commented", "created", "commented", "merged", "state_updated", "created", "commented", "referenced_by", # @@ -570,12 +571,12 @@ test_that("Network construction of the undirected author-issue network with all "subscribed", "mentioned", "subscribed", "commented", "commented", "add_link", "labeled", "referenced_by", "mentioned", "subscribed", "mentioned", "subscribed", "commented", "referenced_by", "labeled", "mentioned", "subscribed", "mentioned", "subscribed", "commented", "labeled", - "created", "commented", "referenced_by", "commented", "commented", "commented", "commented", # - "commented", "commented", "resolution_updated", "created", "commented", "referenced_by", "commented", "commented", - "commented", "commented", "commented", "commented", "commented", 
"commented", "commented", + "created", "commented", "referenced_by", "add_link", "commented", "commented", "commented", "commented", # + "commented", "commented", "resolution_updated", "created", "commented", "referenced_by", "add_link", "commented", + "commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "resolution_updated", "commented", "commented", "commented", "commented", "created", "commented", "commented", "commented", "commented", "commented", "created", # - "commented", "commented", "add_link", "commented", "commented", "commented", "add_link" + "commented", "commented", "referenced_by", "add_link", "commented", "commented", "commented", "referenced_by", "add_link" ), weight = 1, type = TYPE.EDGES.INTRA, diff --git a/tests/test-read.R b/tests/test-read.R index ec8f95bc4..4a7f000e3 100644 --- a/tests/test-read.R +++ b/tests/test-read.R @@ -351,59 +351,58 @@ test_that("Read and parse the issue data.", { issue.data.read.github = read.issues(proj.conf$get.value("datapath.issues"), proj.conf$get.value("issues.from.source")) ## build the expected data.frame - issue.data.expected = data.frame(issue.id = c(rep("", 14), rep("", 7), + issue.data.expected = data.frame(issue.id = c(rep("", 15), rep("", 8), rep("", 9), rep("", 11), rep("", 6), rep("", 5), rep("", 3)), - issue.title = c(rep("[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name", 14), - rep("[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table", 7), + issue.title = c(rep("[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name", 15), + rep("[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table", 8), rep("Error in construct.networks.from.list for openssl function networks", 9), rep("Distinguish directedness of networks and edge-construction algorithm", 11), rep("Example pull request 1", 6), rep("Example pull request 2", 5), 
rep("Example pull request 4", 3)), - issue.type = I(c(rep(list(list("issue" , "bug")), 14), rep(list(list("issue" , "bug")), 7), + issue.type = I(c(rep(list(list("issue" , "bug")), 15), rep(list(list("issue" , "bug")), 8), rep(list(list("issue" , "bug")), 9), rep(list(list("issue", "bug", "enhancement")), 11), rep(list(list("pull request")), 6), rep(list(list("pull request")), 5), rep(list(list("pull request", "enhancement")), 3))), - issue.state = c(rep("closed", 14), rep("open", 7), rep("closed", 9), rep("open", 11), + issue.state = c(rep("closed", 15), rep("open", 8), rep("closed", 9), rep("open", 11), rep("reopened", 6), rep("closed", 5), rep("open", 3)), - issue.resolution = I(c(rep(list(list("fixed")), 14), rep(list(list("unresolved")), 7), + issue.resolution = I(c(rep(list(list("fixed")), 15), rep(list(list("unresolved")), 8), rep(list(list()), 9), rep(list(list()), 11), rep(list(list()), 6), rep(list(list()), 5), rep(list(list()), 3))), - creation.date = get.date.from.string(c(rep("2013-04-21 23:52:09", 14), - rep("2016-07-12 16:01:30", 7), + creation.date = get.date.from.string(c(rep("2013-04-21 23:52:09", 15), + rep("2016-07-12 16:01:30", 8), rep("2016-07-12 15:59:25", 9), rep("2016-07-12 14:30:13", 11), rep("2016-07-14 13:37:00", 6), rep("2016-07-12 14:59:25", 5), rep("2016-07-12 16:02:02", 3))), - closing.date = get.date.from.string(c(rep("2013-05-25 20:02:08", 14), rep(NA, 7), + closing.date = get.date.from.string(c(rep("2013-05-25 20:02:08", 15), rep(NA, 8), rep("2016-07-12 16:06:30", 9), rep(NA, 11), rep(NA, 6), rep("2016-07-12 16:04:59", 5), rep(NA, 3))), - issue.components = I(c(rep(list(list("GUI" , "Interpreters")), 14), rep(list(list("Interpreters")), 7), + issue.components = I(c(rep(list(list("GUI" , "Interpreters")), 15), rep(list(list("Interpreters")), 8), rep(list(list()), 9), rep(list(list()), 11), rep(list(list()), 6), rep(list(list()), 5), rep(list(list()), 3))), event.name = c("created", "commented", "commented", "commented", 
"commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", - "resolution_updated", "referenced_by", "add_link", "created", "commented", "commented", "commented", "commented", - "commented", "created", "assigned", "commented", "state_updated", "add_link", "referenced", - "referenced", "add_link", "add_link", "mentioned", "subscribed", "commented", "mentioned", - "subscribed", "add_link", "mentioned", "subscribed", "labeled", "commented", "referenced_by", - "created", "commented", "state_updated", "commented", "commented", "state_updated", - "created", "commented", "merged", "state_updated", - "referenced_by", "commit_added", "created", "commented"), - author.name = c("Thomas", "Thomas", "Björn", "Björn", "Björn", "Björn", "Olaf", "Björn", - "Björn", "Olaf", "Olaf", "Olaf", "Björn", "Thomas", "Thomas", "Björn", "Björn", "Björn", "Max", - "Max", "Max", "Karl", "Olaf", "Karl", "Olaf", "Karl", "Karl", "Thomas", "Karl", "Thomas", "udo", - "udo", "Thomas", "Björn", "Björn", "Thomas", "Björn", "Björn", "Olaf", "Björn", - "Karl", "Thomas", "Thomas", "Thomas", "Olaf", "Björn", "Olaf", + "resolution_updated", "referenced_by", "add_link", "referenced_by", "add_link", "created", + "commented", "commented", "commented", "commented", "commented", "created", "assigned", "commented", + "state_updated", "add_link", "referenced", "referenced", "add_link", "add_link", "mentioned", "subscribed", + "commented", "mentioned", "subscribed", "add_link", "mentioned", "subscribed", "labeled", "commented", + "referenced_by", "created", "commented", "state_updated", "commented", "commented", "state_updated", + "created", "commented", "merged", "state_updated", "referenced_by", "commit_added", "created", "commented"), + author.name = c("Thomas", "Thomas", "Björn", "Björn", "Björn", "Björn", "Olaf", "Björn", "Björn", "Olaf", "Olaf", "Olaf", + "Björn", "Thomas", "Thomas", "Thomas", "Thomas", "Björn", "Björn", "Björn", "Max", "Max", "Max", "Karl", + 
"Olaf", "Karl", "Olaf", "Karl", "Karl", "Thomas", "Karl", "Thomas", "udo", "udo", "Thomas", "Björn", "Björn", + "Thomas", "Björn", "Björn", "Olaf", "Björn", "Karl", "Thomas", "Thomas", "Thomas", "Olaf", "Björn", "Olaf", "Björn", "Björn", "Olaf", "Olaf", "Thomas", "Björn", "Olaf", "Olaf"), author.email = c("thomas@example.org", "thomas@example.org", "bjoern@example.org", "bjoern@example.org", "bjoern@example.org", "bjoern@example.org", "olaf@example.org", "bjoern@example.org", "bjoern@example.org", "olaf@example.org", "olaf@example.org", "olaf@example.org", - "bjoern@example.org", "thomas@example.org", "thomas@example.org", "bjoern@example.org", + "bjoern@example.org", "thomas@example.org", "thomas@example.org", + "thomas@example.org", "thomas@example.org", "bjoern@example.org", "bjoern@example.org", "bjoern@example.org", "max@example.org", "max@example.org", "max@example.org", "karl@example.org", "olaf@example.org", "karl@example.org", "olaf@example.org", "karl@example.org", "karl@example.org", @@ -423,6 +422,7 @@ test_that("Read and parse the issue data.", { "2013-05-25 04:08:07", "2013-05-25 06:06:53", "2013-05-25 06:22:23", "2013-06-01 06:50:26", "2013-06-01 06:53:06", "2017-05-21 12:00:00", + "2017-05-21 12:00:00", "2017-05-21 12:00:00", "2017-05-21 12:00:00", "2016-07-12 16:01:30", "2016-07-12 16:02:30", "2016-07-15 19:55:39", "2016-07-15 20:07:47", "2016-07-27 20:12:08", @@ -445,8 +445,8 @@ test_that("Read and parse the issue data.", { "2016-07-12 16:02:02", "2016-07-12 16:02:02", "2016-07-12 16:02:02")), event.info.1 = c("open", "open", "open", "open", "open", "open", "open", "open", "open", - "open", "open", "open", "fixed", "", - "", "open", "open", "open", "open", "open", "open", "open", + "open", "open", "open", "fixed", "", "", + "", "", "open", "open", "open", "open", "open", "open", "open", "", "open", "closed", "930af63a030fb92e48eddff01f53284c3eeba80e", "", "", "", "", "Thomas", "Thomas", "open", "Thomas", "Thomas", 
"fb52357f05958007b867da06f4077abdc04fa0d8", "udo", "udo", "decided", "open", "", "open", "open", "closed", "closed", "closed", "open", @@ -454,7 +454,7 @@ test_that("Read and parse the issue data.", { "72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "open", "open"), event.info.2 = NA, # is assigned later event.id = NA, # is assigned later - issue.source = c(rep("jira", 21), rep("github", 20), rep("github", 14)), + issue.source = c(rep("jira", 23), rep("github", 20), rep("github", 14)), artifact.type = "IssueEvent" ) @@ -462,9 +462,9 @@ test_that("Read and parse the issue data.", { list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), - "unresolved", "issue", "issue", list("unresolved"), list("unresolved"), list("unresolved"), - list("unresolved"), list("unresolved"), list("unresolved"), list(), "", list(), "open", - "commit", "", "", "issue", "issue", "thomas@example.org", "thomas@example.org", list(), + "unresolved", "issue", "issue", "issue", "issue", list("unresolved"), list("unresolved"), + list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list(), "", list(), + "open", "commit", "", "", "issue", "issue", "thomas@example.org", "thomas@example.org", list(), "thomas@example.org", "thomas@example.org", "commit", "udo@example.org", "udo@example.org", "", list(), "issue", list(), list(), "open", list(), list(), "closed", list(), list(), "", "open", "issue", diff --git a/tests/test-split-data-activity-based.R b/tests/test-split-data-activity-based.R index c4879aa47..c5bfec9df 100644 --- a/tests/test-split-data-activity-based.R +++ b/tests/test-split-data-activity-based.R @@ -114,8 +114,8 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - 
"2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:46, 50:51, 53:55), ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 25, ], + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 35, 44:48, 52:53, 55:57), ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 27, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] ), mails = list( @@ -193,7 +193,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(16:17, 22:25, 33, 42:46, 50:51, 53:55), ] + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(18:19, 24:27, 35, 44:48, 52:53, 55:57), ] ), mails = list( "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$mails[15:16, ] # when pasta is not configured: rownames(data$mails) %in% 16:17 @@ -307,8 +307,8 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$issues[0, ], "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$issues[0, ], "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$issues[0, ], - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:13, 31:32, 48:49), ], - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:45, 50:51, 53:55), ], + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:13, 33:34, 50:51), ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 35, 44:47, 52:53, 55:57), ], "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$issues[0, ] ), mails = list( @@ -396,7 +396,7 @@ 
patrick::with_parameters_test_that("Split a data object activity-based (activity "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commit.messages ), issues = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:13, 16:17, 22:24, 31:33, 42:45, 48:51, 53:55), ] + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:13, 18:19, 24:26, 33:35, 44:47, 50:53, 55:57), ] ), mails = list( "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$mails @@ -508,11 +508,11 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity ), issues = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$issues[rownames(data$issues) %in% 1:10, ], - "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$issues[rownames(data$issues) %in% c(11:13, 22:24, 31:32, 42:43, 48:49), ], - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$issues[rownames(data$issues) %in% c(16:17, 33, 44:46, 50:51, 53:55), ], - "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$issues[rownames(data$issues) %in% c(18:21, 25:26, 29:30, 47, 52), ], - "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$issues[rownames(data$issues) %in% c(14:15, 27:28, 34:38, 41), ], - "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(39:40), ] + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$issues[rownames(data$issues) %in% c(11:13, 24:26, 33:34, 44:45, 50:51), ], + "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$issues[rownames(data$issues) %in% c(18:19, 35, 46:48, 52:53, 55:57), ], + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$issues[rownames(data$issues) %in% c(20:23, 27:28, 31:32, 49, 54), ], + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$issues[rownames(data$issues) %in% c(14:17, 29:30, 36:40, 43), ], + "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(41:42), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -578,7 +578,7 @@ 
patrick::with_parameters_test_that("Split a data object activity-based (activity ## test that the config contains the correct splitting information expected.config = list( split.type = "activity-based", - split.length = 65, + split.length = 67, split.basis = "issues", split.sliding.window = FALSE, split.revisions = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), @@ -705,10 +705,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:46, 50:51, 53:55), ], - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(16:17, 33, 45:46, 50:51, 53:55), ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 25, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 25, ] + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 35, 44:48, 52:53, 55:57), ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(18:19, 35, 47:48, 52:53, 55:57), ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 27, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 27, ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -789,7 +789,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(16:17, 22:25, 33, 42:46, 50:51, 53:55), ] + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(18:19, 24:27, 35, 44:48, 52:53, 55:57), ] ), mails = list( "2016-07-12 15:58:59-2016-07-12 16:06:33" = 
data$mails[15:16, ] # when pasta is not configured: rownames(data$mails) %in% 16:17 @@ -902,10 +902,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:46, 50:51, 53:55), ], - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(16:17, 33, 45:46, 50:51, 53:55), ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 25, ], - "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 25, ], + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 35, 44:48, 52:53, 55:57), ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(18:19, 35, 47:48, 52:53, 55:57), ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 27, ], + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 27, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] ), mails = list( @@ -1050,10 +1050,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$issues[0, ], "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$issues[0, ], "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$issues[0, ], - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:13, 31:32, 48:49), ], - "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$issues[rownames(data$issues) %in% c(1:13, 31:32, 48:49), ], - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:45, 50:51, 53:55), ], - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:45, 50:51, 53:55), ] + "2010-07-12 
12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:13, 33:34, 50:51), ], + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$issues[rownames(data$issues) %in% c(1:13, 33:34, 50:51), ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 35, 44:47, 52:53, 55:57), ], + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 35, 44:47, 52:53, 55:57), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -1152,7 +1152,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commit.messages ), issues = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:13, 16:17, 22:24, 31:33, 42:45, 48:51, 53:55), ] + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:13, 18:19, 24:26, 33:35, 44:47, 50:53, 55:57), ] ), mails = list( "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$mails @@ -1277,15 +1277,15 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity ), issues = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$issues[rownames(data$issues) %in% 1:10, ], - "2013-05-06 01:04:34-2016-07-12 15:30:02" = data$issues[rownames(data$issues) %in% c(6:13, 48:49), ], - "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$issues[rownames(data$issues) %in% c(11:13, 22:24, 31:32, 42:43, 48:49), ], - "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$issues[rownames(data$issues) %in% c(16, 22:24, 31:32, 42:45, 53), ], - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$issues[rownames(data$issues) %in% c(16:17, 33, 44:46, 50:51, 53:55), ], - "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$issues[rownames(data$issues) %in% c(17:19, 25, 33, 46:47, 50:51, 54:55), ], - "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$issues[rownames(data$issues) %in% c(18:21, 25:26, 
29:30, 47, 52), ], - "2016-07-27 20:12:08-2016-10-05 16:45:09" = data$issues[rownames(data$issues) %in% c(20:21, 26:27, 29:30, 34:35, 41, 52), ], - "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$issues[rownames(data$issues) %in% c(14:15, 27:28, 34:38, 41), ], - "2016-10-05 16:45:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:15, 28, 36:40), ] + "2013-05-06 01:04:34-2016-07-12 15:30:02" = data$issues[rownames(data$issues) %in% c(6:13, 50:51), ], + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$issues[rownames(data$issues) %in% c(11:13, 24:26, 33:34, 44:45, 50:51), ], + "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$issues[rownames(data$issues) %in% c(18, 24:26, 33:34, 44:47, 55), ], + "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$issues[rownames(data$issues) %in% c(18:19, 35, 46:48, 52:53, 55:57), ], + "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$issues[rownames(data$issues) %in% c(19:21, 27, 35, 48:49, 52:53, 56:57), ], + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$issues[rownames(data$issues) %in% c(20:23, 27:28, 31:32, 49, 54), ], + "2016-07-27 20:12:08-2016-10-05 16:45:09" = data$issues[rownames(data$issues) %in% c(22:23, 28:29, 31:32, 36:37, 43, 54), ], + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$issues[rownames(data$issues) %in% c(14:17, 29:30, 36:40, 43), ], + "2016-10-05 16:45:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:17, 30, 38:42), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -1363,7 +1363,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity ## test that the config contains the correct splitting information expected.config = list( split.type = "activity-based", - split.length = 65, + split.length = 67, split.basis = "issues", split.sliding.window = FALSE, # The sliding-window approach does not apply if we only have one range or less split.revisions = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), @@ -1484,8 
+1484,8 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 33, 42:46, 50:51, 53:55), ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 25, ] + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 35, 44:48, 52:53, 55:57), ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 27, ] ), mails = list( "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$mails[15:16, ], # when pasta is not configured: rownames(data$mails) %in% 16:17 @@ -1611,7 +1611,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w ), issues = list( "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$issues[0, ], - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:13, 16:17, 22:24, 31:33, 42:45, 48:51, 53:55), ] + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:13, 18:19, 24:26, 33:35, 44:47, 50:53, 55:57), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -1737,8 +1737,8 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w "2016-07-12 16:03:59-2017-05-23 12:32:40" = data$commit.messages ), issues = list( - "2013-04-21 23:52:09-2016-07-12 16:03:59" = data$issues[rownames(data$issues) %in% c(1:13, 16:17, 22:24, 31:32, 42:45, 48:49, 53:55), ], - "2016-07-12 16:03:59-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:15, 18:21, 26:29, 30, 33:40, 25, 41, 46:47, 50:52), ] + "2013-04-21 23:52:09-2016-07-12 16:03:59" = data$issues[rownames(data$issues) %in% c(1:13, 18:19, 24:26, 33:34, 44:47, 50:51, 55:57), ], + "2016-07-12 16:03:59-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:17, 
20:23, 28:31, 32, 35:42, 27, 43, 48:49, 52:54), ] ), mails = list( ## comments indicate row names when pasta is not configured diff --git a/tests/test-split-data-time-based.R b/tests/test-split-data-time-based.R index 92853da3a..7d115ab4a 100644 --- a/tests/test-split-data-time-based.R +++ b/tests/test-split-data-time-based.R @@ -114,9 +114,9 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(16, 22:24, 42:45), ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(17, 33, 53:55), ], - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(25, 46, 50:51), ] + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(18, 24:26, 44:47), ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(19, 35, 55:57), ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(27, 48, 52:53), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -231,7 +231,7 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$issues[0, ], "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$issues[0, ], "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 31:33, 42:45, 48:51, 53:55), ] + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 33:35, 44:47, 50:53, 55:57), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -342,8 +342,8 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis ), issues = 
list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$issues[rownames(data$issues) %in% 1:13, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(16:38, 41:55), ], - "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:15, 39:40), ] + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(18:40, 43:57), ], + "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:17, 41:42), ] ), mails = list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$mails[0, ], @@ -459,11 +459,11 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(16, 22:24, 42:45), ], - "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$issues[rownames(data$issues) %in% c(16:17, 45, 53:55), ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(17, 33, 53:55), ], - "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$issues[rownames(data$issues) %in% c(33, 46, 50:51), ], - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(25, 46, 50:51), ] + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(18, 24:26, 44:47), ], + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$issues[rownames(data$issues) %in% c(18:19, 47, 55:57), ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(19, 35, 55:57), ], + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$issues[rownames(data$issues) %in% c(35, 48, 52:53), ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(27, 48, 52:53), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -598,7 +598,7 @@ patrick::with_parameters_test_that("Split a data 
object time-based (split.basis "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$issues[0, ], "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 31:33, 42:45, 48:51, 53:55), ] + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 33:35, 44:47, 50:53, 55:57), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -724,8 +724,8 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis issues = list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$issues[rownames(data$issues) %in% 1:13, ], "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$issues[0, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(16:38, 41:55), ], - "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:55), ] + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(18:40, 43:57), ], + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:57), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -831,7 +831,7 @@ patrick::with_parameters_test_that("Split a data object time-based (bins = ... ) "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commit.messages ), issues = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(16:38, 41:55), ] + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(18:40, 43:57), ] ), mails = list( "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ] @@ -928,8 +928,8 @@ patrick::with_parameters_test_that("Split a data object time-based (bins = ... 
, "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$commit.messages ), issues = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(16:38, 41:55), ], - "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$issues[rownames(data$issues) %in% c(14:15, 39:40), ] + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(18:40, 43:57), ], + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$issues[rownames(data$issues) %in% c(14:17, 41:42), ] ), mails = list( "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ], @@ -1075,10 +1075,10 @@ patrick::with_parameters_test_that("Split a data object time-based using custom "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$commit.messages ), issues = list( - "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$issues[rownames(data$issues) %in% c(22:24, 31:32, 42:44), ], - "2016-07-12 16:00:00-2016-07-12 16:05:00" = data$issues[rownames(data$issues) %in% c(16:17, 33, 45, 50:51, 53:55), ], - "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$issues[rownames(data$issues) %in% c(18:21, 25:26, 29:30, 41, 46:47, 52), ], - "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$issues[rownames(data$issues) %in% c(27, 34), ] + "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$issues[rownames(data$issues) %in% c(24:26, 33:34, 44:46), ], + "2016-07-12 16:00:00-2016-07-12 16:05:00" = data$issues[rownames(data$issues) %in% c(18:19, 35, 47, 52:53, 55:57), ], + "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$issues[rownames(data$issues) %in% c(20:23, 27:28, 31:32, 43, 48:49, 54), ], + "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$issues[rownames(data$issues) %in% c(29, 36), ] ), mails = list( ## comments indicate rownames when pasta is not configured @@ -1311,9 +1311,9 @@ patrick::with_parameters_test_that("Split a data object time-based with equal-si "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 
15:58:59-2016-07-12 16:01:30" = data$issues[rownames(data$issues) %in% c(22:24, 42:45), ], - "2016-07-12 16:01:30-2016-07-12 16:04:01" = data$issues[rownames(data$issues) %in% c(16:17, 33, 53:55), ], - "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(25, 46, 50:51), ] + "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$issues[rownames(data$issues) %in% c(24:26, 44:47), ], + "2016-07-12 16:01:30-2016-07-12 16:04:01" = data$issues[rownames(data$issues) %in% c(18:19, 35, 55:57), ], + "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(27, 48, 52:53), ] ), mails = list( "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$mails[0, ], @@ -1427,7 +1427,7 @@ patrick::with_parameters_test_that("Split a data object time-based with equal-si "2004-10-09 18:38:13-2007-09-18 06:00:04" = data$issues[0, ], "2007-09-18 06:00:04-2010-08-26 17:21:55" = data$issues[0, ], "2010-08-26 17:21:55-2013-08-04 04:43:46" = data$issues[rownames(data$issues) %in% 1:13, ], - "2013-08-04 04:43:46-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(16:17, 22:24, 31:33, 42:45, 48:51, 53:55), ] + "2013-08-04 04:43:46-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 33:35, 44:47, 50:53, 55:57), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -1539,7 +1539,7 @@ patrick::with_parameters_test_that("Split a data object time-based with equal-si issues = list( "2013-04-21 23:52:09-2014-09-01 12:05:39" = data$issues[rownames(data$issues) %in% 1:13, ], "2014-09-01 12:05:39-2016-01-12 00:19:09" = data$issues[0, ], - "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 14:55, ] + "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 14:57, ] ), mails = list( "2013-04-21 23:52:09-2014-09-01 12:05:39" = data$mails[0, ], From c064affcfff2eb170d8bdcb39d837a7ff62b2cbd Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Mon, 29 Jan 2024 10:33:19 +0100 Subject: [PATCH 08/13] Adjust data-splitting tests to test for 'split.revisions.dates' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace unnecessary 'split.revision.dates' in expected configs, as it is not even an attribute of the splitting-info, and therefore will always be NULL. Instead use 'split.revisions.dates', which represents the revisions in POSIXct format (and is equivalent to the 'bins' attribute set on the split networks). Signed-off-by: Maximilian Löffler --- tests/test-split-data-activity-based.R | 107 ++++++++++++++----------- tests/test-split-data-time-based.R | 83 ++++++++++--------- 2 files changed, 108 insertions(+), 82 deletions(-) diff --git a/tests/test-split-data-activity-based.R b/tests/test-split-data-activity-based.R index c5bfec9df..363405b60 100644 --- a/tests/test-split-data-activity-based.R +++ b/tests/test-split-data-activity-based.R @@ -87,14 +87,16 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:10", "2016-07-12 16:06:32", "2016-07-12 16:06:33") expected.config = list( split.type = "activity-based", split.length = 3, split.basis = "commits", split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:10", "2016-07-12 16:06:32", "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) + lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) names(actual) = names(expected.config) @@ -170,13 +172,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original
ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:33") expected.config = list( split.type = "activity-based", split.length = 18, split.basis = "commits", split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -269,15 +272,16 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2004-10-09 18:38:13", "2010-07-12 11:05:35", "2010-07-12 12:05:41", + "2010-07-12 12:05:44" ,"2016-07-12 15:58:40", "2016-07-12 16:05:37", + "2016-07-12 16:05:38") expected.config = list( split.type = "activity-based", split.length = 3, split.basis = "mails", split.sliding.window = FALSE, - split.revisions = c("2004-10-09 18:38:13", "2010-07-12 11:05:35", "2010-07-12 12:05:41", - "2010-07-12 12:05:44" ,"2016-07-12 15:58:40", "2016-07-12 16:05:37", - "2016-07-12 16:05:38"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -373,13 +377,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2004-10-09 18:38:13", "2016-07-12 16:05:38") expected.config = list( split.type = "activity-based", split.length = 26, split.basis = "mails", split.sliding.window = FALSE, - split.revisions = c("2004-10-09 18:38:13", 
"2016-07-12 16:05:38"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -472,15 +477,16 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2013-04-21 23:52:09", "2013-05-25 06:22:23", "2016-07-12 15:59:59", + "2016-07-12 16:06:30", "2016-08-07 15:37:02", "2017-05-23 12:31:34", + "2017-05-23 12:32:40") expected.config = list( split.type = "activity-based", split.length = 9, split.basis = "issues", split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2013-05-25 06:22:23", "2016-07-12 15:59:59", - "2016-07-12 16:06:30", "2016-08-07 15:37:02", "2017-05-23 12:31:34", - "2017-05-23 12:32:40"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -576,13 +582,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2013-04-21 23:52:09", "2017-05-23 12:32:40") expected.config = list( split.type = "activity-based", split.length = 67, split.basis = "issues", split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -675,14 +682,15 @@ patrick::with_parameters_test_that("Split a data 
object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:06:10", + "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:33") expected.config = list( split.type = "activity-based", split.length = 3, split.basis = "commits", split.sliding.window = TRUE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:06:10", - "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -766,13 +774,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:33") expected.config = list( split.type = "activity-based", split.length = 18, split.basis = "commits", split.sliding.window = FALSE, # The sliding-window approach does not apply if we only have one range or less - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -869,15 +878,16 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:06:10", + "2016-07-12 16:06:20", "2016-07-12 16:06:32", 
"2016-07-12 16:06:32", + "2016-07-12 16:06:33") expected.config = list( split.type = "activity-based", split.length = 3, split.basis = "commits", split.sliding.window = TRUE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:06:10", - "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:32", - "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1000,16 +1010,17 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2004-10-09 18:38:13", "2005-02-09 18:49:49", "2010-07-12 11:05:35", + "2010-07-12 12:05:34", "2010-07-12 12:05:41", "2010-07-12 12:05:42", + "2010-07-12 12:05:44", "2010-07-12 12:05:45", "2016-07-12 15:58:40", + "2016-07-12 15:58:50", "2016-07-12 16:05:37", "2016-07-12 16:05:38") expected.config = list( split.type = "activity-based", split.length = 3, split.basis = "mails", split.sliding.window = TRUE, - split.revisions = c("2004-10-09 18:38:13", "2005-02-09 18:49:49", "2010-07-12 11:05:35", - "2010-07-12 12:05:34", "2010-07-12 12:05:41", "2010-07-12 12:05:42", - "2010-07-12 12:05:44", "2010-07-12 12:05:45", "2016-07-12 15:58:40", - "2016-07-12 15:58:50", "2016-07-12 16:05:37", "2016-07-12 16:05:38"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1129,13 +1140,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct 
splitting information + revisions = c("2004-10-09 18:38:13", "2016-07-12 16:05:38") expected.config = list( split.type = "activity-based", split.length = 26, split.basis = "mails", split.sliding.window = FALSE, # The sliding-window approach does not apply if we only have one range or less - split.revisions = c("2004-10-09 18:38:13", "2016-07-12 16:05:38"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1232,16 +1244,17 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2013-04-21 23:52:09", "2013-05-06 01:04:34", "2013-05-25 06:22:23", + "2016-07-12 15:30:02", "2016-07-12 15:59:59", "2016-07-12 16:02:02", + "2016-07-12 16:06:30", "2016-07-27 20:12:08", "2016-08-07 15:37:02", + "2016-10-05 16:45:09", "2017-05-23 12:31:34", "2017-05-23 12:32:40") expected.config = list( split.type = "activity-based", split.length = 9, split.basis = "issues", split.sliding.window = TRUE, - split.revisions = c("2013-04-21 23:52:09", "2013-05-06 01:04:34", "2013-05-25 06:22:23", - "2016-07-12 15:30:02", "2016-07-12 15:59:59", "2016-07-12 16:02:02", - "2016-07-12 16:06:30", "2016-07-27 20:12:08", "2016-08-07 15:37:02", - "2016-10-05 16:45:09", "2017-05-23 12:31:34", "2017-05-23 12:32:40"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1361,13 +1374,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting 
information + revisions = c("2013-04-21 23:52:09", "2017-05-23 12:32:40") expected.config = list( split.type = "activity-based", split.length = 67, split.basis = "issues", split.sliding.window = FALSE, # The sliding-window approach does not apply if we only have one range or less - split.revisions = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1459,13 +1473,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:20", "2016-07-12 16:06:33") expected.config = list( split.type = "activity-based", split.length = 4, split.basis = "commits", split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:20", "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1585,13 +1600,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2004-10-09 18:38:13", "2010-07-12 12:05:43", "2016-07-12 16:05:38") expected.config = list( split.type = "activity-based", split.length = 8, split.basis = "mails", split.sliding.window = FALSE, - split.revisions = c("2004-10-09 18:38:13", "2010-07-12 12:05:43", "2016-07-12 16:05:38"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) 
) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1712,13 +1728,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2013-04-21 23:52:09", "2016-07-12 16:03:59", "2017-05-23 12:32:40") expected.config = list( split.type = "activity-based", split.length = 24, split.basis = "issues", split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2016-07-12 16:03:59", "2017-05-23 12:32:40"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) diff --git a/tests/test-split-data-time-based.R b/tests/test-split-data-time-based.R index 7d115ab4a..f315f91d5 100644 --- a/tests/test-split-data-time-based.R +++ b/tests/test-split-data-time-based.R @@ -87,13 +87,14 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:01:59", "2016-07-12 16:04:59", "2016-07-12 16:06:33") expected.config = list( split.type = "time-based", split.length = "3 min", split.basis = "commits", split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:01:59", "2016-07-12 16:04:59", "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -198,14 +199,15 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis info = 
"Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2004-10-09 18:38:13", "2007-10-10 12:38:13", "2010-10-10 06:38:13", "2013-10-10 00:38:13", + "2016-07-12 16:05:38") expected.config = list( split.type = "time-based", split.length = "3 years", split.basis = "mails", split.sliding.window = FALSE, - split.revisions = c("2004-10-09 18:38:13", "2007-10-10 12:38:13", "2010-10-10 06:38:13", - "2013-10-10 00:38:13", "2016-07-12 16:05:38"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -314,13 +316,14 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2013-04-21 23:52:09", "2015-04-22 11:52:09", "2017-04-21 23:52:09", "2017-05-23 12:32:40") expected.config = list( split.type = "time-based", split.length = "2 years", split.basis = "issues", split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2015-04-22 11:52:09", "2017-04-21 23:52:09", "2017-05-23 12:32:40"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -426,15 +429,15 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:29", "2016-07-12 16:01:59", "2016-07-12 16:03:29", + "2016-07-12 16:04:59", "2016-07-12 16:06:29", "2016-07-12 16:06:33") expected.config = list( 
split.type = "time-based", split.length = "3 min", split.basis = "commits", split.sliding.window = TRUE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:29", "2016-07-12 16:01:59", - "2016-07-12 16:03:29", "2016-07-12 16:04:59", "2016-07-12 16:06:29", - "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -554,15 +557,16 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2004-10-09 18:38:13", "2006-04-10 15:38:13", "2007-10-10 12:38:13", "2009-04-10 09:38:13", + "2010-10-10 06:38:13", "2012-04-10 03:38:13", "2013-10-10 00:38:13", "2015-04-10 21:38:13", + "2016-07-12 16:05:38") expected.config = list( split.type = "time-based", split.length = "3 years", split.basis = "mails", split.sliding.window = TRUE, - split.revisions = c("2004-10-09 18:38:13", "2006-04-10 15:38:13", "2007-10-10 12:38:13", - "2009-04-10 09:38:13", "2010-10-10 06:38:13", "2012-04-10 03:38:13", - "2013-10-10 00:38:13", "2015-04-10 21:38:13", "2016-07-12 16:05:38"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -692,14 +696,15 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2013-04-21 23:52:09", "2014-04-22 05:52:09", "2015-04-22 11:52:09", "2016-04-21 17:52:09", + "2017-04-21 23:52:09", "2017-05-23 12:32:40") expected.config = list( split.type = 
"time-based", split.length = "2 years", split.basis = "issues", split.sliding.window = TRUE, - split.revisions = c("2013-04-21 23:52:09", "2014-04-22 05:52:09", "2015-04-22 11:52:09", - "2016-04-21 17:52:09", "2017-04-21 23:52:09", "2017-05-23 12:32:40"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -807,13 +812,14 @@ patrick::with_parameters_test_that("Split a data object time-based (bins = ... ) info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-01-01 00:00:00", "2016-12-31 23:59:59") expected.config = list( split.type = "time-based", - split.length = c("2016-01-01 00:00:00", "2016-12-31 23:59:59"), + split.length = revisions, split.basis = NULL, split.sliding.window = FALSE, - split.revisions = c("2016-01-01 00:00:00", "2016-12-31 23:59:59"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -903,13 +909,14 @@ patrick::with_parameters_test_that("Split a data object time-based (bins = ... 
, info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-01-01 00:00:00", "2016-12-31 23:59:59", "2017-06-03 03:03:03") expected.config = list( split.type = "time-based", - split.length = c("2016-01-01 00:00:00", "2016-12-31 23:59:59", "2017-06-03 03:03:03"), + split.length = revisions, split.basis = NULL, split.sliding.window = FALSE, - split.revisions = c("2016-01-01 00:00:00", "2016-12-31 23:59:59", "2017-06-03 03:03:03"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1044,15 +1051,15 @@ patrick::with_parameters_test_that("Split a data object time-based using custom info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:00:00", "2016-07-12 16:00:00", "2016-07-12 16:05:00", "2016-08-08 00:00:00", + "2016-10-05 09:00:00") expected.config = list( split.type = "time-based", - split.length = c("2016-07-12 15:00:00", "2016-07-12 16:00:00", "2016-07-12 16:05:00", - "2016-08-08 00:00:00", "2016-10-05 09:00:00"), + split.length = revisions, split.basis = NULL, split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:00:00", "2016-07-12 16:00:00", "2016-07-12 16:05:00", - "2016-08-08 00:00:00", "2016-10-05 09:00:00"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1283,14 +1290,14 @@ patrick::with_parameters_test_that("Split a data object time-based with equal-si info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information - + revisions 
= c("2016-07-12 15:58:59", "2016-07-12 16:01:30", "2016-07-12 16:04:01", "2016-07-12 16:06:33") expected.config = list( split.type = "time-based", split.length = "2M 31S", split.basis = "commits", split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:01:30", "2016-07-12 16:04:01", "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1394,14 +1401,15 @@ patrick::with_parameters_test_that("Split a data object time-based with equal-si info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2004-10-09 18:38:13", "2007-09-18 06:00:04", "2010-08-26 17:21:55", "2013-08-04 04:43:46", + "2016-07-12 16:05:38") expected.config = list( split.type = "time-based", split.length = "2y 0m 342d 23H 21M 51S", split.basis = "mails", split.sliding.window = FALSE, - split.revisions = c("2004-10-09 18:38:13", "2007-09-18 06:00:04", "2010-08-26 17:21:55", - "2013-08-04 04:43:46", "2016-07-12 16:05:38"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1510,13 +1518,14 @@ patrick::with_parameters_test_that("Split a data object time-based with equal-si info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2013-04-21 23:52:09", "2014-09-01 12:05:39", "2016-01-12 00:19:09", "2017-05-23 12:32:40") expected.config = list( split.type = "time-based", split.length = "1y 0m 132d 6H 13M 30S", split.basis = "issues", split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2014-09-01 
12:05:39", "2016-01-12 00:19:09", "2017-05-23 12:32:40"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) From 93051ab848ec94de138b0513dac22f6da0d20885 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Wed, 31 Jan 2024 14:48:11 +0100 Subject: [PATCH 09/13] Ensure that 'bins' attribute is POSIXct and correctly set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ensure that the 'bins' attribute of network-splits is consistently of type POSIXct. Further, ensure that it is always present upon calling any network splitting function. Add correctness checks to tests. Signed-off-by: Maximilian Löffler --- tests/test-split-data-activity-based.R | 2 +- tests/test-split-network-activity-based.R | 36 +++++++++++++++++++++++ tests/test-split-network-time-based.R | 26 ++++++++++++++++ util-split.R | 14 ++++++--- 4 files changed, 73 insertions(+), 5 deletions(-) diff --git a/tests/test-split-data-activity-based.R b/tests/test-split-data-activity-based.R index 363405b60..f0c2812cf 100644 --- a/tests/test-split-data-activity-based.R +++ b/tests/test-split-data-activity-based.R @@ -19,7 +19,7 @@ ## Copyright 2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2022 by Jonathan Baumann -## Copyright 2023 by Maximilian Löffler +## Copyright 2023-2024 by Maximilian Löffler ## All Rights Reserved. 
context("Splitting functionality, activity-based splitting of data.") diff --git a/tests/test-split-network-activity-based.R b/tests/test-split-network-activity-based.R index 472f91b71..5c9036416 100644 --- a/tests/test-split-network-activity-based.R +++ b/tests/test-split-network-activity-based.R @@ -72,6 +72,11 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:05:41", "2016-07-12 16:06:10", + "2016-07-12 16:06:32", "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -91,6 +96,10 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges (number.edges (2)).") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -112,6 +121,11 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges (number.windows (1)).") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:05:41", "2016-07-12 16:06:32", + "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -171,6 +185,12 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges ## check 
ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:05:41", + "2016-07-12 16:05:41", "2016-07-12 16:06:10", "2016-07-12 16:06:10", + "2016-07-12 16:06:32", "2016-07-12 16:06:32", "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -191,6 +211,10 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges (number.edges (2)).") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -212,6 +236,11 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges (number.windows (1)).") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:05:41", "2016-07-12 16:06:32", + "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -269,6 +298,13 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:05:41", + "2016-07-12 16:05:41", "2016-07-12 16:06:10", "2016-07-12 16:06:10", + "2016-07-12 
16:06:32", "2016-07-12 16:06:32", "2020-02-20 20:20:20", + "2020-02-20 20:20:21")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) diff --git a/tests/test-split-network-time-based.R b/tests/test-split-network-time-based.R index bdcd21d35..18b4be51d 100644 --- a/tests/test-split-network-time-based.R +++ b/tests/test-split-network-time-based.R @@ -16,6 +16,7 @@ ## Copyright 2020 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## Copyright 2022 by Jonathan Baumann +## Copyright 2024 by Maximilian Löffler ## All Rights Reserved. context("Splitting functionality, time-based splitting of networks.") @@ -76,6 +77,12 @@ patrick::with_parameters_test_that("Split a network time-based (time.period = .. ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges.") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:59", + "2016-07-12 16:02:59", "2016-07-12 16:04:59", + "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -195,6 +202,13 @@ patrick::with_parameters_test_that("Split a network time-based (time.period = .. 
## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges.") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 15:59:59", + "2016-07-12 16:00:59", "2016-07-12 16:01:59", + "2016-07-12 16:02:59", "2016-07-12 16:03:59", + "2016-07-12 16:04:59", "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -259,6 +273,12 @@ patrick::with_parameters_test_that("Split a network time-based (bins = ...), ", ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges.") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:00", "2016-07-12 16:00:59", + "2016-07-12 16:02:59", "2016-07-12 16:04:59", + "2016-07-12 17:21:43")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -366,6 +386,12 @@ patrick::with_parameters_test_that("Split a network time-based with equal-sized ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges.") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:53", + "2016-07-12 16:02:47", "2016-07-12 16:04:41", + "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) diff --git a/util-split.R b/util-split.R index f76fa66b4..597dc4000 100644 --- a/util-split.R +++ b/util-split.R @@ -292,7 +292,7 @@ split.data.activity.based = function(project.data, activity.type = c("commits", project.conf.new = project.conf.new) ## extract bins - bins.date.middle = attr(cf.data.sliding, "bins") + bins.date.middle = get.date.string(attr(cf.data.sliding, "bins")) ## Both, the 
last sliding range and the last regular range end at the very last item. ## This is the case because the end of the data is never cropped (like the beginning is). @@ -732,7 +732,7 @@ split.network.activity.based = function(network, number.edges = 5000, number.win sliding.window = FALSE) ## compute bins for sliding windows: pairwise middle between dates - bins.date.middle = attr(networks.sliding, "bins") + bins.date.middle = get.date.string(attr(networks.sliding, "bins")) ## Both, the last sliding network and the last regular network end at the very last edge. ## This is the case because the end of the edges is never cropped (like the beginning is). @@ -769,7 +769,7 @@ split.network.activity.based = function(network, number.edges = 5000, number.win } ## set bin attribute - attr(networks, "bins") = bins.date + attr(networks, "bins") = get.date.from.string(bins.date) ## set ranges as names revs = get.date.string(bins.date) @@ -814,6 +814,12 @@ split.network.time.based.by.ranges = function(network, ranges, remove.isolates = } ) + ## convert ranges to bins + bins.starts = sapply(ranges.bounds, function(range) range[1]) + bins.end = ranges.bounds[[length(ranges.bounds)]][2] + bins.date = get.date.from.unix.timestamp(c(bins.starts, bins.end)) + + attr(nets.split, "bins") = bins.date return(nets.split) } @@ -857,7 +863,7 @@ split.network.by.bins = function(network, bins, bins.vector, bins.date = NULL, r }) ## set 'bins' attribute, if specified if (!is.null(bins.date)) { - attr(nets, "bins") = bins.date + attr(nets, "bins") = get.date.from.string(bins.date) } logging::logdebug("split.network.by.bins: finished.") return(nets) From 2497e231a052054f61f1f82fa795b8b64e70de7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Tue, 13 Feb 2024 19:01:51 +0100 Subject: [PATCH 10/13] Adhere to coding conventions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add single quotes around coding variables in comments, 
improve description of return value for 'get.bin.dates.from.ranges', and move said method closer to similar methods in file for consistency. Signed-off-by: Maximilian Löffler --- util-misc.R | 20 ++++++++++---------- util-networks.R | 7 ++++--- util-split.R | 4 ++-- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/util-misc.R b/util-misc.R index d6ced669c..3720d890b 100644 --- a/util-misc.R +++ b/util-misc.R @@ -984,6 +984,16 @@ get.range.bounds = function(range) { return (range) } +#' Obtain the start and end dates from given ranges. +#' +#' @param ranges the ranges to get the dates from +#' +#' @return a vector that contains the start and end dates of all given ranges +#' sorted and disambiguated +get.bin.dates.from.ranges = function(ranges) { + dates = sort(unique(get.date.from.unix.timestamp(unlist(ranges)))) + return(dates) +} #' Get the data from a data frame in a specific range. #' @@ -1012,13 +1022,3 @@ get.data.from.range = function(range, data) { } } -#' Obtain the start and end dates from given ranges. -#' -#' @param ranges the ranges to get the dates from -#' -#' @return a sorted vector of all the start the end dates of the given ranges -get.bin.dates.from.ranges = function(ranges) { - dates = sort(unname(unique(get.date.from.unix.timestamp(unlist(ranges))))) - return(dates) -} - diff --git a/util-networks.R b/util-networks.R index 891d50402..cf60e1835 100644 --- a/util-networks.R +++ b/util-networks.R @@ -497,7 +497,7 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", ## we would need to guess the correct direction. 
if (!private$network.conf$get.entry("artifact.directed")) { - ## obtain add_link events from jira + ## obtain 'add_link' events from jira jira.add.links = add.links[add.links$issue.source == "jira", ] matched = c() @@ -506,10 +506,11 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", add.link = jira.add.links[i, ] if (all(add.link %in% matched)) { + ## make sure to not remove both duplicate edges next } - ## match any add_link events, that are the reverse direction of 'add.link', + ## match any 'add_link' events, that are the reverse direction of 'add.link', ## but the same timestamp and author information. match = jira.add.links[( jira.add.links$issue.id == add.link$event.info.1 & @@ -517,7 +518,7 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", jira.add.links$date == add.link$date & jira.add.links$author.name == add.link$author.name), ] - ## if a match is found, remove 'add.link' and its corresponding referenced_by event + ## if a match is found, remove 'add.link' and its corresponding 'referenced_by' event if (nrow(match) > 0) { add.links = add.links[!( add.links$issue.id == match$issue.id & diff --git a/util-split.R b/util-split.R index 597dc4000..928ad5bbd 100644 --- a/util-split.R +++ b/util-split.R @@ -296,7 +296,7 @@ split.data.activity.based = function(project.data, activity.type = c("commits", ## Both, the last sliding range and the last regular range end at the very last item. ## This is the case because the end of the data is never cropped (like the beginning is). - ## split.data.activity.based, which is invoked to obtain both set of ranges, creates + ## 'split.data.activity.based', which is invoked to obtain both set of ranges, creates ## ranges until all elements are in one. 
## ## The conditional below inspects whether the very last item is in the first or the second @@ -736,7 +736,7 @@ split.network.activity.based = function(network, number.edges = 5000, number.win ## Both, the last sliding network and the last regular network end at the very last edge. ## This is the case because the end of the edges is never cropped (like the beginning is). - ## Both split.network.activity.based, and split.network.by.bins, which are invoked to obtain + ## Both 'split.network.activity.based', and 'split.network.by.bins', which are invoked to obtain ## the two set of networks, creates networks until all edges are contained. ## ## The conditional below inspects whether the very last edge is in the first or the second From ed77bd726bf92e06c2fc9145a5847787a8d0588b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Tue, 13 Feb 2024 19:05:06 +0100 Subject: [PATCH 11/13] Fix JIRA issue deduplication MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a bug that could lead to the removal of both duplicates in the initial deduplication of JIRA issue data, that was caused by the usage of an inappropriate data structure to remember already handled issues. 
Signed-off-by: Maximilian Löffler --- util-networks.R | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/util-networks.R b/util-networks.R index cf60e1835..1068cb998 100644 --- a/util-networks.R +++ b/util-networks.R @@ -499,19 +499,20 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", ## obtain 'add_link' events from jira jira.add.links = add.links[add.links$issue.source == "jira", ] - matched = c() + matched = list() ## iterate over all add_link events from jira for (i in 1:nrow(jira.add.links)) { add.link = jira.add.links[i, ] - if (all(add.link %in% matched)) { - ## make sure to not remove both duplicate edges + + ## ensure not to remove both duplicate edges + if (any(sapply(matched, function(entry) identical(entry, add.link)))) { next } ## match any 'add_link' events, that are the reverse direction of 'add.link', - ## but the same timestamp and author information. + ## but have the same timestamp and author information match = jira.add.links[( jira.add.links$issue.id == add.link$event.info.1 & jira.add.links$event.info.1 == add.link$issue.id & @@ -524,13 +525,13 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", add.links$issue.id == match$issue.id & add.links$event.info.1 == match$event.info.1 & add.links$date == match$date & - add.links$author.name == add.link$author.name), ] + add.links$author.name == match$author.name), ] referenced.bys = referenced.bys[!( referenced.bys$issue.id == add.link$issue.id & referenced.bys$event.info.1 == add.link$event.info.1 & referenced.bys$date == add.link$date & referenced.bys$author.name == add.link$author.name), ] - matched = c(match, add.link) + matched = append(matched, list(match)) } } } From 749b0b8a1cb0fc1e4d0cb03af39e0b550549d91d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Tue, 27 Feb 2024 12:02:31 +0100 Subject: [PATCH 12/13] Adjust Copyright Headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 
Maximilian Löffler --- tests/test-networks-artifact.R | 2 +- tests/test-networks-author.R | 2 +- tests/test-read.R | 2 +- tests/test-split-data-time-based.R | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test-networks-artifact.R b/tests/test-networks-artifact.R index 88d77bb11..253e08ba5 100644 --- a/tests/test-networks-artifact.R +++ b/tests/test-networks-artifact.R @@ -15,7 +15,7 @@ ## Copyright 2017-2019 by Claus Hunsen ## Copyright 2018 by Barbara Eckl ## Copyright 2018 by Jakob Kronawitter -## Copyright 2023 by Maximilian Löffler +## Copyright 2023-2024 by Maximilian Löffler ## Copyright 2024 by Leo Sendelbach ## All Rights Reserved. diff --git a/tests/test-networks-author.R b/tests/test-networks-author.R index 4de1460d9..d4d0e9faa 100644 --- a/tests/test-networks-author.R +++ b/tests/test-networks-author.R @@ -21,7 +21,7 @@ ## Copyright 2018 by Jakob Kronawitter ## Copyright 2018-2019 by Anselm Fehnker ## Copyright 2021 by Johannes Hostert -## Copyright 2023 by Maximilian Löffler +## Copyright 2023-2024 by Maximilian Löffler ## All Rights Reserved. diff --git a/tests/test-read.R b/tests/test-read.R index 4a7f000e3..db3645d4d 100644 --- a/tests/test-read.R +++ b/tests/test-read.R @@ -21,7 +21,7 @@ ## Copyright 2021 by Johannes Hostert ## Copyright 2021 by Mirabdulla Yusifli ## Copyright 2022 by Jonathan Baumann -## Copyright 2022-2023 by Maximilian Löffler +## Copyright 2022-2024 by Maximilian Löffler ## All Rights Reserved. diff --git a/tests/test-split-data-time-based.R b/tests/test-split-data-time-based.R index f315f91d5..67945105d 100644 --- a/tests/test-split-data-time-based.R +++ b/tests/test-split-data-time-based.R @@ -20,7 +20,7 @@ ## Copyright 2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2022 by Jonathan Baumann -## Copyright 2023 by Maximilian Löffler +## Copyright 2023-2024 by Maximilian Löffler ## All Rights Reserved. 
context("Splitting functionality, time-based splitting of data.") From d7c1de6ccfe89f94e983f5f08cf27bea8337eb32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Tue, 27 Feb 2024 15:29:03 +0100 Subject: [PATCH 13/13] Update 'NEWS.md' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- NEWS.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index a54b974bf..c0b0dd305 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,15 +6,16 @@ ### Added -- Add issue-based artifact-networks (PR #244, 98a93ee721a293410623aafe46890cfba9d81e72, 771bcc8d961d419b53a1e891e9dc536371f1143b, 368e79264adf5a5358c04518c94ad2e1c13e212b) +- Add issue-based artifact-networks, in which issues form vertices connected by edges that represent issue references. If possible, disambiguate duplicate JIRA issue references that originate from [codeface-extraction](https://github.com/se-sic/codeface-extraction) (PR #244, PR #249, 98a93ee721a293410623aafe46890cfba9d81e72, 771bcc8d961d419b53a1e891e9dc536371f1143b, 368e79264adf5a5358c04518c94ad2e1c13e212b, fa3167c289c9785f3a5db03d9724848f1441a63d, 4646d581d5e1f63260692b396a8bd8f51b0da48fda, ed77bd726bf92e06c2fc9145a5847787a8d0588b) - Add a new `split.data.by.bins` function (not to be confused with a previously existing function that had the same name and was renamed in this context), which splits data based on given activity-based bins (PR #244, ece569ceaf557bb38cd0cfad437b69b30fe8a698, ed5feb214a123b605c9513262f187cfd72b9e1f4) - Add new `assert.sparse.matrices.equal` function to compare two sparse matrices for equality for testing purposes (PR #248, 9784cdf12d1497ee122e2ae73b768b8c334210d4, d9f1a8d90e00a634d7caeb5e7f8f262776496838) -- Add tests for file `util-networks.misc.R` for issue #242 (PR #248, f3202a6f96723d11c170346556d036cf087521c8, 030574b9d0f3435db4032d0e195a3d407fb7244b, 
380b02234275127297fcd508772c69db21c216de, 8b803c50d60fc593e4e527a08fd4c2068d801a48, 7335c3dd4d0302b024a66d18701d9800ed3fe806, 6b600df04bec1fe70c272604f274ec5309840e65) +- Add tests for file `util-networks-misc.R` for issue #242 (PR #248, f3202a6f96723d11c170346556d036cf087521c8, 030574b9d0f3435db4032d0e195a3d407fb7244b, 380b02234275127297fcd508772c69db21c216de, 8b803c50d60fc593e4e527a08fd4c2068d801a48, 7335c3dd4d0302b024a66d18701d9800ed3fe806, 6b600df04bec1fe70c272604f274ec5309840e65) - Add the possibility to simplify edges of multiple-relation networks into a single edge at all instead of a single edge per relation (PR #250, 2105ea89b5227e7c9fa78fea9de1977f2d9e8faa) +- Add `get.bin.dates.from.ranges` function to convert date ranges into bins format (PR #249, a1842e9be46596321ee86860fd87d17a3c88f50f, 858b1812ebfc3194cc6a03c99f3ee7d161d1ca15) ### Changed/Improved -- Enhance testing data by adding `add_link` and `referenced_by` issue events which connect issues to form edges in issue-based artifact-networks (PR #244, 9f840c040d552e8639aa82c3dd537c189679b348, ea4fe8d3c84f948af6147cf0137e80181ebb7a1e) +- Enhance testing data by adding `add_link` and `referenced_by` issue events, which connect issues to form edges in issue-based artifact-networks. This includes duplicate edge information in JIRA data as produced by [codeface-extraction](https://github.com/se-sic/codeface-extraction) (PR #244, 9f840c040d552e8639aa82c3dd537c189679b348, ea4fe8d3c84f948af6147cf0137e80181ebb7a1e, 6eb731102301b1af08f4affb40d1f8df94500e34) - Add input validation for the `bins` parameter in `split.data.time.based` and `split.data.by.bins` (PR #244, ed0a5302ea8c8934d7200b95be7ac1446305af07, 5e5ecbac44d07927b953ae9d4330a616f8224ba7) - Rename `split.data.by.bins` into `split.dataframe.by.bins` as this it what it does (PR #244, ed5feb214a123b605c9513262f187cfd72b9e1f4) - Enhance `get.author.names.from.network` and `get.author.names.from.data` to always have the same output format. 
Now it doesn't depend on the `global` flag anymore (PR #248, d87d32564156f13c83ebe3361c2b68e5d0ac16ac, ddbfe68d3e628e82f34e09b36fffe886646986c5) @@ -22,12 +23,14 @@ - Throw an error in `convert.adjacency.matrix.list.to.array` if the function is called with wrong parameters (PR #248, ece2d38b4972745af3a83e06f32317a06465a345, 1a3e510df15f5fa4e920e9fce3e0e162c27cd6d1) - Rename `compare.networks` to `assert.networks.equal` to better match the purpose of the function (PR #248, d9f1a8d90e00a634d7caeb5e7f8f262776496838) - Explicitly add R version 4.3 to the CI test pipeline (9f346d5bc3cfc553f01e5e80f0bbe51e1dc2b53e) +- Simplify call chain-, and branching-routes in network-splitting functions and consequently set the `bins` attribute on every output network-split (while minimizing recalculations) (PR #249, a1842e9be46596321ee86860fd87d17a3c88f50f) +- Test for the presence and validity of the `bins` attribute on network-, and data-splits (PR #249, c064affcfff2eb170d8bdcb39d837a7ff62b2cbd, 93051ab848ec94de138b0513dac22f6da0d20885) ### Fixed - Reformat `event.info.1` column of issue data according to the format, if the content of the `event.info.1` field references another issue (PR #244, 62ff9d0f31adbefb3381936237dc4ab984e33acb) - Fix an issue in activity-based splitting where elements close to the border of bins might be assigned to the wrong bin. The issue was caused by the usage of `split.data.time.based` inside `split.data.activity.based` to split data into the previously derived bins, when elements close to bin borders share the same timestamps. 
It is fixed by replacing `split.data.time.based` by `split.data.by.bins` (PR #244, ece569ceaf557bb38cd0cfad437b69b30fe8a698) -- Remove the last range when using a sliding-window approach and the last range's elements are fully contained in the second last range (PR #244, 48ef4fa685adf6e5d85281e5b90a8ed8f6aeb197) +- Remove the last range when using a sliding-window approach and the last range's elements are fully contained in the second last range (PR #244, 48ef4fa685adf6e5d85281e5b90a8ed8f6aeb197, 943228fbc91eed6854dacafa7075441e58b22675) - Rename vertex attribute `IssueEvent` to `Issue` in multi-networks, to be consistent with bipartite-networks (PR #244, 26d7b7e9fd6d33d1c0a8a08f19c5c2e30346a3d9) - Fix `get.expanded.adjacency` to work if the provided author list does not contain all authors from network and add a warning when that happens since it causes some authors from the network to be lost in the resulting matrix (PR #248, ff59017e114b10812dcfb1704a19e01fc1586a13) - Fix `get.expanded.adjacency.matrices` to have correct names for the columns and rows (PR #248, e72eff864a1cb1a4aecd430e450d4a6a5044fdf2)