Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow time-based data-splitting with multiple datasources as 'split.basis' #261

Merged
merged 8 commits into from
May 27, 2024
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- Add commit-interaction networks that can be created with `create.author.network` and `create.artifact.network` if the `artifact.relation` and `author.relation` are configured to be `commit.interaction` (PR #252, d82857fbebd1111bb16588a4223bb24a8dcd07de, 329d97ec3de36a9e1bcadc0c7a53c1d92e8b481c) as well as tests for these features (PR #252, 07e7ed744209b0251217fa8f7f35d9b9875face2, 7068cfa10d993dcae3f5e3f76f8cafa99fa8b350)
- Add helper function for prefixing function names with file names in `util-read.R` (PR #252, f8ea987b138173cf0509c7910e0572d8ee1b3f1f)
- Add line-based code coverage reports into CI pipeline. Coverage reports are generated by `coverage.R` (PR #262, 10cac49d005e87c3964cc61711e7f5acef749626, b3b9f4ac7a9911bd00293c68fac88e0f9033bdfb, c815d18dc6266d620a7a145493417b87ac08679e, e8093525fdaf46e54f2f7fcc6358ca7892e795e5, 32d04823e2007c63d2a43ce59bea3057327c19a7)
- Add the possibility to split data time-based by multiple data sources (PR #261, 1088395f46b84028c8d7c463ca86b5dc38500c26, e1f79fc9e40cd6f41c946be42db364b2101cfe10, 0bb187fec0fd801d7634bf8d5180525770f6ab0b, 371a97ac6ebf3de4fe9360dea79d62e2ed3ef585)

### Changed/Improved

Expand Down
10 changes: 10 additions & 0 deletions showcase.R
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ run.lapply(data, "get.data.path.callgraph")

## * Data-based splitting --------------------------------------------------

## split time-based using commits as the data source to split by (with sliding windows)
cf.data = split.data.time.based(x.data, time.period = "18 months", split.basis = "commits", sliding.window = TRUE)
for (range in names(cf.data)) {
y.data = cf.data[[range]]
Expand All @@ -289,6 +290,15 @@ for (range in names(cf.data)) {
}
print(run.lapply(cf.data, "get.class.name"))

## split time-based using commits and issues as the data sources to split by (without sliding windows)
cf.data = split.data.time.based(x.data, time.period = "18 month", split.basis = c("commits", "issues"))
bockthom marked this conversation as resolved.
Show resolved Hide resolved
## iterate over the names of the split result; for each one, construct a 'NetworkBuilder' on that
## element's data (using the 'net.conf' defined earlier in the script) and plot its bipartite network
for (range in names(cf.data)) {
y.data = cf.data[[range]]
y = NetworkBuilder$new(project.data = y.data, network.conf = net.conf)
plot.network(y$get.bipartite.network())
}
## print what 'run.lapply' yields for "get.class.name" on the split result
## (presumably the class name of every element -- confirm against the definition of 'run.lapply')
print(run.lapply(cf.data, "get.class.name"))

mybins = c("2012-07-10 15:58:00", "2012-07-15 16:02:00", "2012-07-20 16:04:00", "2012-07-25 16:06:30")
cf.data = split.data.time.based(x.data, bins = mybins)
for (range in names(cf.data)) {
Expand Down
59 changes: 59 additions & 0 deletions tests/test-split-data-time-based.R
Original file line number Diff line number Diff line change
Expand Up @@ -770,6 +770,65 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis
"pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE)
))


##
## Tests for split.data.time.based(..., split.basis = c('mails', 'issues')), with and without sliding windows
##

patrick::with_parameters_test_that("Split a data object time-based (split.basis = c('mails', 'issues'))", {

    ## Purpose: verify the bins that 'split.data.time.based' computes when two data sources
    ## ('mails' and 'issues') are passed together as 'split.basis'.
    ## Parameter (supplied via 'patrick::cases' below):
    ##  - test.sliding.window: logical, forwarded as 'sliding.window' to 'split.data.time.based'

    ## configuration objects
    proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT)
    net.conf = NetworkConf$new()

    ## data object
    project.data = ProjectData$new(proj.conf)

    ## remove really old mail data (drop the first 12 rows)
    mail.data = project.data$get.mails()
    mail.data = mail.data[-(1:12), ]
    project.data$set.mails(mail.data)

    ## check mail date bounds
    expect_equal(min(mail.data$date), as.POSIXct("2016-07-12 15:58:40"))
    expect_equal(max(mail.data$date), as.POSIXct("2016-07-12 16:05:37"))

    ## keep issue data that roughly overlaps the mail data:
    ## drop the first 12 rows, then rows 8 to 12 of what remains
    issue.data = project.data$get.issues()
    issue.data = issue.data[-(1:12), ]
    issue.data = issue.data[-(8:12), ]
    project.data$set.issues(issue.data)

    ## check issue date bounds
    expect_equal(min(issue.data$date), as.POSIXct("2016-07-12 15:59:25"))
    expect_equal(max(issue.data$date), as.POSIXct("2016-07-12 16:06:01"))

    ## split by 'mails' and 'issues'
    results = split.data.time.based(project.data, time.period = "1 min",
                                    split.basis = c("mails", "issues"), sliding.window = test.sliding.window)

    ## define bins for 'test.sliding.window' = TRUE:
    ## the first boundary equals the earliest mail date asserted above, the boundaries advance in
    ## steps of 30 seconds, and the last boundary lies one second after the latest issue date
    ## asserted above
    expected.bins = get.date.from.string(c("2016-07-12 15:58:40", "2016-07-12 15:59:10", "2016-07-12 15:59:40",
                                           "2016-07-12 16:00:10", "2016-07-12 16:00:40", "2016-07-12 16:01:10",
                                           "2016-07-12 16:01:40", "2016-07-12 16:02:10", "2016-07-12 16:02:40",
                                           "2016-07-12 16:03:10", "2016-07-12 16:03:40", "2016-07-12 16:04:10",
                                           "2016-07-12 16:04:40", "2016-07-12 16:05:10", "2016-07-12 16:05:40",
                                           "2016-07-12 16:06:02"))

    if (!test.sliding.window) {
        ## define bins for 'test.sliding.window' = FALSE:
        ## keep every second boundary of the sliding-window bins (i.e., steps of one minute)
        ## and always keep the final boundary
        expected.bins = expected.bins[c(seq(1, length(expected.bins), by = 2), length(expected.bins))]
    }

    expect_equal(attr(results, "bins"), expected.bins)

}, patrick::cases(
    "sliding.windows: FALSE" = list(test.sliding.window = FALSE),
    "sliding.windows: TRUE" = list(test.sliding.window = TRUE)
))


## * * bins ----------------------------------------------------------------

##
Expand Down
50 changes: 33 additions & 17 deletions util-split.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ requireNamespace("lubridate") # for date conversion
#' time-sized windows for all ranges. If set, the \code{time.period} and \code{bins} parameters are ignored;
#' consequently, \code{sliding.window} does not make sense then either.
#' [default: NULL]
#' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues'
#' @param split.basis the data source to use as the basis for split bins, either 'commits', 'mails', 'issues',
#' or an arbitrary combination of them
#' [default: "commits"]
#' @param sliding.window logical indicating whether the splitting should be performed using a sliding-window approach
#' [default: FALSE]
Expand All @@ -65,6 +66,14 @@ split.data.time.based = function(project.data, time.period = "3 months", bins =
number.windows = NULL, split.basis = c("commits", "mails", "issues"),
sliding.window = FALSE, project.conf.new = NULL) {

# ensure 'split.basis' defaults to 'commits' if not defined
# and allow it to contain multiple data sources if explicitly wanted
if(!hasArg("split.basis")) {
bockthom marked this conversation as resolved.
Show resolved Hide resolved
split.basis = match.arg.or.default(split.basis, several.ok = FALSE, default = "commits")
} else {
split.basis = match.arg.or.default(split.basis, several.ok = TRUE)
}

# validate existence and type of the 'bins' parameter
if (!is.null(bins) && !lubridate::is.POSIXct(bins)) {
dates = parallel::mclapply(unlist(bins), get.date.from.string)
Expand All @@ -89,7 +98,9 @@ split.data.time.based = function(project.data, time.period = "3 months", bins =
#' \code{bins}: Dates defining the start of bins (the last date defines the end of the last bin, in an
#' *exclusive* manner).
#' The expected format of \code{bins} is produced by \code{split.get.bins.activity.based}.
#' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues'
#' @param split.basis the data source that was used to obtain \code{bins} from \code{split.get.bins.activity.based},
#' either 'commits', 'mails', or 'issues'. \code{split.basis} is necessary to associate
#' \code{bins$vector} with the correct data elements.
#' [default: "commits"]
#' @param sliding.window logical indicating whether a sliding-window approach was used when obtaining the \code{bins}.
#'
Expand All @@ -99,6 +110,9 @@ split.data.time.based = function(project.data, time.period = "3 months", bins =
split.data.by.bins = function(project.data, activity.amount, bins, split.basis = c("commits", "mails", "issues"),
sliding.window) {

## get basis for splitting process
split.basis = match.arg(split.basis)
bockthom marked this conversation as resolved.
Show resolved Hide resolved

# validate type of the 'bins' parameter
if (is.null(bins) || !is.list(bins)) {
logging::logerror("The bins parameter needs to be of type list, (is %s)", class(bins))
Expand Down Expand Up @@ -183,7 +197,7 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
activity.amount = 5000, number.windows = NULL,
sliding.window = FALSE, project.conf.new = NULL) {

## get basis for splitting process
## get activity type for splitting process
activity.type = match.arg(activity.type)

## get actual raw data
Expand All @@ -195,13 +209,13 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
})
names(data) = data.sources

## if the data used by the split basis is not present, load it automatically
## if the data used by the splitting activity type is not present, load it automatically
if (!(activity.type %in% project.data$get.cached.data.sources("only.unfiltered"))) {
function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[activity.type]]
project.data[[function.name]]()
}

## define ID columns for mails and commits
## define ID columns for commits, mails, and issues
id.column = list(
commits = "hash",
mails = "message.id",
Expand Down Expand Up @@ -252,7 +266,7 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
logging::loginfo("Splitting data '%s' into activity ranges of %s %s (%s windows).",
project.data$get.class.name(), activity.amount, activity.type, number.windows)

## get bins based on 'split.basis'. Here the 'include.duplicate.ids' parameter flag must be set, to
## get bins based on 'activity.type'. Here the 'include.duplicate.ids' parameter flag must be set, to
## retrieve bins which map every event to a bin including events with non-unique ids. This is important
## to ensure that every range really has 'activity.amount' many entries after splitting
logging::logdebug("Getting activity-based bins.")
Expand Down Expand Up @@ -887,8 +901,8 @@ split.network.by.bins = function(network, bins, bins.vector, bins.date = NULL, r
#' @param split.by.time logical indicating whether splitting is done time-based or activity-bins-based
#' @param number.windows see \code{number.windows} from \code{split.data.time.based}
#' [default: NULL]
#' @param split.basis the data source to use as the basis for split bins, either 'commits', 'mails', or 'issues'
#' [default: "commits"]
#' @param split.basis either formatted as the \code{split.basis} from \code{split.data.time.based}
#' or from \code{split.data.by.bins}.
#' @param sliding.window logical indicating whether the splitting should be performed using a sliding-window approach
#' [default: FALSE]
#' @param project.conf.new the new project config to construct the \code{RangeData} objects.
Expand All @@ -900,16 +914,16 @@ split.network.by.bins = function(network, bins, bins.vector, bins.date = NULL, r
#' @seealso split.data.time.based
#' @seealso split.data.by.bins
split.data.by.time.or.bins = function(project.data, splitting.length, bins, split.by.time,
number.windows = NULL, split.basis = c("commits", "mails", "issues"),
sliding.window = FALSE, project.conf.new = NULL) {

## get basis for splitting process
split.basis = match.arg(split.basis)
number.windows = NULL, split.basis, sliding.window = FALSE,
project.conf.new = NULL) {

## if the data used by the split basis is not present, load it automatically
if (!(split.basis %in% project.data$get.cached.data.sources("only.unfiltered"))) {
function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[split.basis]]
project.data[[function.name]]()
for (i in seq_along(split.basis)) {
data.source = split.basis[i]
if (!(data.source %in% project.data$get.cached.data.sources("only.unfiltered"))) {
function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[data.source]]
project.data[[function.name]]()
}
}

## get actual raw data
Expand Down Expand Up @@ -945,7 +959,9 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli
## if bins are NOT given explicitly
if (is.null(bins)) {
## get bins based on split.basis
bins = split.get.bins.time.based(data[[split.basis]][["date"]], splitting.length, number.windows)$bins
dates = project.data$get.data.timestamps(split.basis)
dates = get.date.from.unix.timestamp(unname(unlist(dates)))
bins = split.get.bins.time.based(dates, splitting.length, number.windows)[["bins"]]
bins.labels = head(bins, -1)
## logging
logging::loginfo("Splitting data '%s' into time ranges of %s based on '%s' data.",
Expand Down