From ece569ceaf557bb38cd0cfad437b69b30fe8a698 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Mon, 4 Sep 2023 22:40:26 +0200 Subject: [PATCH] Add 'split.data.by.bins.vector' and fix miscellaneous bugs in splitting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modify 'split.data.time.based' to be able to split by activity-based bins. Rename the function to 'split.data.by.time.or.bins'. Introduce wrapper functions 'split.data.by.bins.vector' and 'split.data.time.based' to call 'split.data.by.time.or.bins'. Add 'include.duplicate.ids' parameter in 'split.get.bins.activity.based' to obtain bins covering all data elements from 'df' by which the split is being performed, regardless of the elements ids uniqueness. In 'split.data.activity.based', after calculating the bins to place data elements into, replace the time-based splitting by 'split.data.by.bins.vector'. Time-based splitting is incorrect for the case that the date of the last element in a bin is the same as the date of the first element of the next bin. Adjust calculation of 'offset.end' in 'split.data.activity.based' to fix a bug where because of a short last window the end offset would cross the border of the last window, overlapping into the second last. Because of this overlap the last sliding windows would not be calculated as expected. This works towards #239. Signed-off-by: Maximilian Löffler --- util-split.R | 134 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 107 insertions(+), 27 deletions(-) diff --git a/util-split.R b/util-split.R index 1c0ea9e9..36e27c97 100644 --- a/util-split.R +++ b/util-split.R @@ -22,6 +22,7 @@ ## Copyright 2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2022 by Jonathan Baumann +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. 
@@ -63,6 +64,52 @@ requireNamespace("lubridate") # for date conversion split.data.time.based = function(project.data, time.period = "3 months", bins = NULL, number.windows = NULL, split.basis = c("commits", "mails", "issues"), sliding.window = FALSE, project.conf.new = NULL) { + split = split.data.by.time.or.bins(project.data, splitting.length = time.period, bins, split.by.time = TRUE, + number.windows, split.basis, sliding.window, project.conf.new) + return(split) +} + +#' Split project data in activity-bin-based ranges as specified +#' +#' @param project.data the *Data object from which the data is retrieved +#' @param activity.amount the amount of data elements with unique ids to be considered in a bin, an integer. +#' @param bins the date objects defining the start of ranges (the last date defines the end of the last range, in an +#' *exclusive* manner), augmented with a bin vector mapping unique ids to bins. +#' [default: NULL] +#' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues' +#' [default: "commits"] +#' @param sliding.window logical indicating whether a sliding-window approach was used when obtaining the \code{bins}. 
+#' +#' @return the list of RangeData objects, each referring to one bin +split.data.by.bins.vector = function(project.data, activity.amount, bins, split.basis = c("commits", "mails", "issues"), + sliding.window) { + split = split.data.by.time.or.bins(project.data, activity.amount, bins, split.by.time = FALSE, + sliding.window = sliding.window, split.basis = split.basis) + return(split) +} + +#' Split project data in time-based or activity-bin-based ranges as specified +#' +#' @param project.data the *Data object from which the data is retrieved +#' @param splitting.length either \code{time.period} from \code{split.data.time.based} +#' or \code{activity.amount} from \code{split.data.by.bins.vector} +#' @param bins either \code{bins} from \code{split.data.time.based} +#' or \code{bins} from \code{split.data.by.bins.vector} +#' @param split.by.time logical indicating whether splitting is done time-based or activity-bin-based +#' @param number.windows see \code{number.windows} from \code{split.data.time.based} +#' [default: NULL] +#' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues' +#' [default: "commits"] +#' @param sliding.window logical indicating whether the splitting should be performed using a sliding-window approach +#' [default: FALSE] +#' @param project.conf.new the new project config to construct the \code{RangeData} objects. +#' If \code{NULL}, a clone of \code{project.data$get.project.conf()} will be used. 
+#' [default: NULL] +#' +#' @return the list of RangeData objects, each referring to one time period +split.data.by.time.or.bins = function(project.data, splitting.length, bins, split.by.time, + number.windows = NULL, split.basis = c("commits", "mails", "issues"), + sliding.window = FALSE, project.conf.new = NULL) { ## get basis for splitting process split.basis = match.arg(split.basis) @@ -99,26 +146,32 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = ## remove sliding windows sliding.window = FALSE } + + ## initiate variable + split.by.bins = FALSE + ## if bins are NOT given explicitly if (is.null(bins)) { ## get bins based on split.basis - bins = split.get.bins.time.based(data[[split.basis]][["date"]], time.period, number.windows)$bins + bins = split.get.bins.time.based(data[[split.basis]][["date"]], splitting.length, number.windows)$bins bins.labels = head(bins, -1) - split.by.bins = FALSE ## logging logging::loginfo("Splitting data '%s' into time ranges of %s based on '%s' data.", - project.data$get.class.name(), time.period, split.basis) + project.data$get.class.name(), splitting.length, split.basis) } - ## when bins are given explicitly + ## when bins are given explicitly, get bins based on parameter else { - ## remove sliding windows - sliding.window = FALSE - ## get bins based on parameter - split.basis = NULL - bins = get.date.from.string(bins) - bins = get.date.string(bins) + if (split.by.time) { + split.basis = NULL + split.by.bins = TRUE + sliding.window = FALSE + bins = get.date.from.string(bins) + bins = get.date.string(bins) + } else { + bins.vector = bins[["vector"]] + bins = bins[["bins"]] + } bins.labels = head(bins, -1) - split.by.bins = TRUE ## logging logging::loginfo("Splitting data '%s' into time ranges [%s].", project.data$get.class.name(), paste(bins, collapse = ", ")) @@ -129,7 +182,7 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = bins.ranges = construct.ranges(bins) 
names(bins.ranges) = bins.ranges - if ((length(bins.ranges) <= 1) && sliding.window) { + if (split.by.time && (length(bins.ranges) <= 1) && sliding.window) { logging::logwarn("Sliding-window approach does not apply for one range or less.") sliding.window = FALSE } @@ -140,13 +193,16 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = project.conf.new = project.data$get.project.conf()$clone() } - if (!sliding.window) { + if (!sliding.window || !split.by.time) { ## split data data.split = parallel::mclapply(data.to.split, function(df.name) { logging::logdebug("Splitting %s.", df.name) ## identify bins for data df = data[[df.name]] - df.bins = findInterval(df[["date"]], bins.date, all.inside = FALSE) + df.bins = if (!split.by.time && (df.name == split.basis)) + bins.vector + else + findInterval(df[["date"]], bins.date, all.inside = FALSE) ## split data according to df.bins df.split = split(df, df.bins) ## add proper labels/names @@ -192,10 +248,10 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = ## perform different steps for sliding-window approach ranges = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), - time.period = time.period, overlap = 0.5, raw = FALSE, + time.period = splitting.length, overlap = 0.5, raw = FALSE, include.end.date = FALSE) # bins have already been prepared correctly bins.info = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), - time.period = time.period, overlap = 0.5, raw = TRUE, + time.period = splitting.length, overlap = 0.5, raw = TRUE, include.end.date = FALSE) # bins have already been prepared correctly bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info))))) bins = get.date.string(bins.date) @@ -214,7 +270,7 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = ## add splitting information to project configuration project.conf.new$set.splitting.info( - type = 
"time-based", + type = if (split.by.time) "time-based" else "activity-based", length = if (split.by.bins) { bins } @@ -228,8 +284,8 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = ) )) } - else time.period - }, + else splitting.length + }, basis = split.basis, sliding.window = sliding.window, revisions = bins, @@ -363,14 +419,14 @@ split.data.activity.based = function(project.data, activity.type = c("commits", ## get bins based on split.basis logging::logdebug("Getting activity-based bins.") bins.data = split.get.bins.activity.based(data[[activity.type]], id.column[[activity.type]], - activity.amount, remove.duplicate.bins = TRUE) + activity.amount, remove.duplicate.bins = TRUE, include.duplicate.ids = TRUE) bins = bins.data[["bins"]] bins.date = get.date.from.string(bins) ## split the data based on the extracted timestamps logging::logdebug("Splitting data based on time windows arising from activity bins.") - cf.data = split.data.time.based(project.data, bins = bins.date, split.basis = activity.type, - project.conf.new = project.conf.new) + cf.data = split.data.by.bins.vector(project.data, bins = bins.data, activity.amount = activity.amount, + sliding.window = sliding.window, split.basis = activity.type) ## perform additional steps for sliding-window approach: ## for activity-based sliding-window bins to work, we need to crop the data appropriately and, @@ -387,6 +443,13 @@ split.data.activity.based = function(project.data, activity.type = c("commits", ## offsets used for cropping (half the first/last bin) offset.start = floor(activity.amount / 2) offset.end = (items.unique.count - offset.start) %% activity.amount + + # make sure that end offset does not go above one window + last.window = cf.data[[length(cf.data)]][[DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[activity.type]]]]() + length.of.last.window = length(unique(last.window[[ id.column[[activity.type]] ]])) + + offset.end = max(c(length.of.last.window - offset.start, 0)) + 
## cut the data appropriately if (offset.end > 0) { items.cut = c( @@ -435,7 +498,7 @@ split.data.activity.based = function(project.data, activity.type = c("commits", ## and the data of the last regular range is contained in the last sliding-window range, then: ## remove the last regular range as it is not complete and we don't loose data when removing it last.regular.range = cf.data[[length(cf.data)]] - last.sliding.range = cf.data[[length(cf.data) - 1]] + last.sliding.range = cf.data.sliding[[length(cf.data.sliding) - 1]] get.activity.data = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[activity.type]] last.regular.range.ids = (last.regular.range[[get.activity.data]]())[[ id.column[[activity.type]] ]] @@ -1102,13 +1165,18 @@ split.get.bins.time.based = function(dates, time.period, number.windows = NULL) #' @param activity.amount the amount of activity denoting the number of unique items #' in each split bin [default: 5000] #' @param remove.duplicate.bins remove duplicate bin borders? [default: FALSE] +#' @param include.duplicate.ids include entries of the \code{df} with non-unique ids +#' in the creation of the bins. This should! not change bin borders +#' as entries with the same id should! share the same \code{date} attribute. 
+#' [default: FALSE] #' #' @return a list, -#' the item 'vector': the bins each row in 'df' belongs to (increasing integers), +#' the item 'vector': the bins each row in 'df' belongs to (increasing integers), #' the item 'bins': the bin labels, described by dates, each bin containing -#' 'acitivity.amount' many unique items; each item in the vector indicates +#' 'activity.amount' many unique items; each item in the vector indicates #' the start of a bin, although the last item indicates the end of the last bin -split.get.bins.activity.based = function(df, id, activity.amount, remove.duplicate.bins = FALSE) { +split.get.bins.activity.based = function(df, id, activity.amount, remove.duplicate.bins = FALSE, + include.duplicate.ids = FALSE) { logging::logdebug("split.get.bins.activity.based: starting") ## get the unique integer IDs for each item in 'id' column ids = df[[id]] @@ -1120,11 +1188,23 @@ split.get.bins.activity.based = function(df, id, activity.amount, remove.duplica if (bins.number.complete != 0) rep(seq_len(bins.number.complete), each = activity.amount), rep(bins.number.complete + 1, bins.number.incomplete) ) + + ## pad bins with entries for all duplicate ids + if (include.duplicate.ids) { + bins.activity.padded = c() + for (i in seq_along(ids)) { + ## create an extra entry for every duplicate id in the same bin as + ## the first occurrence of the id + current.bin = bins.activity[ which(ids.unique == ids[i]) ] + bins.activity.padded = c(bins.activity.padded, current.bin) + } + bins.activity = bins.activity.padded + } bins.number = max(bins.activity) ## join ids and bin numbers bins.mapping = data.frame( - id = ids.unique, + id = if (include.duplicate.ids) ids else ids.unique, bin = bins.activity )