Parallel io additions #83

Merged
merged 37 commits into from Jan 24, 2020

Commits
68c5f05
removed metadata md from loadData, panel set to nothing as default
oHunewald Jan 13, 2020
e7689e9
removed md from loadData call
oHunewald Jan 13, 2020
f9137be
removed md from loadData call
oHunewald Jan 13, 2020
a78fc10
overloaded function getTotalSize for single file load
oHunewald Jan 13, 2020
683e676
add function splitRange, overload generateIO for single file splitting
oHunewald Jan 13, 2020
a32e913
finished single file splitting
oHunewald Jan 13, 2020
d11b85b
include test single file splitting
oHunewald Jan 13, 2020
6d1e87e
tests for single file splitting and loading
oHunewald Jan 13, 2020
259a635
removed cc from trainGigaSOM and doEpoch in new version
oHunewald Jan 14, 2020
5720358
check for different types of panel, to subset the data while loading
oHunewald Jan 14, 2020
73d86d9
removed cc from function call
oHunewald Jan 14, 2020
2fb0b08
added transform as optional parameter
oHunewald Jan 14, 2020
a3a303f
set named arguments to false, code cleaning
oHunewald Jan 14, 2020
e90254b
added test functions for loadData
oHunewald Jan 14, 2020
f142f98
added test loadData
oHunewald Jan 14, 2020
32b31c1
add fcs manual loading as reference
oHunewald Jan 14, 2020
9445452
added named arguments as they are now false by default
oHunewald Jan 14, 2020
b77861b
commented functions
oHunewald Jan 15, 2020
fa55d68
use loadData as wrapper around file splitting and loading, added rang…
oHunewald Jan 15, 2020
492ac3f
added function header for generateIO for single file
oHunewald Jan 22, 2020
cab83b3
changed info to error
oHunewald Jan 22, 2020
9e03ca3
corrected function header loadData
oHunewald Jan 22, 2020
b516e6e
typo
oHunewald Jan 22, 2020
05b4165
removed time macro
oHunewald Jan 22, 2020
e5476d6
removed comments
oHunewald Jan 22, 2020
d8927b0
combined getTotalSize into one
oHunewald Jan 22, 2020
e596769
removed _
oHunewald Jan 22, 2020
842517b
forgot test marco
oHunewald Jan 22, 2020
67b699f
added transform:true to loadData
oHunewald Jan 22, 2020
ed54d72
combined tests
oHunewald Jan 22, 2020
bf9f266
removed redundant code
oHunewald Jan 23, 2020
80ffa98
merging splitrange and splitting functions
laurentheirendt Jan 24, 2020
d7fd17c
change of variable name
laurentheirendt Jan 24, 2020
80dfd34
documentation changes
laurentheirendt Jan 24, 2020
cc02bc3
merging the output functions
laurentheirendt Jan 24, 2020
6d01e50
rename load function
laurentheirendt Jan 24, 2020
a3c33f5
Merge pull request #2 from laurentheirendt/parallel-io-additions-lh
oHunewald Jan 24, 2020
13 changes: 5 additions & 8 deletions docs/tutorials/example_workflow.jl
@@ -21,19 +21,16 @@ nWorkers = 2
addprocs(nWorkers, topology=:master_worker)
@everywhere using GigaSOM

generateIO(dataPath, md, nWorkers, true, 1, true)

R = Vector{Any}(undef,nWorkers)

@time @sync for (idx, pid) in enumerate(workers())
@async R[idx] = fetch(@spawnat pid loadData(idx, "input-$idx.jls", md, panel))
end
# R: array of references to the data loaded by each worker
# use '_' (or just "R, ") to ignore the second return value,
# which is used later for indexing the data files
R, _ = loadData(dataPath, md, nWorkers, panel=panel, reduce=true, transform=true)

som = initGigaSOM(R, 10, 10)

cc = map(Symbol, vcat(lineageMarkers, functionalMarkers))

@time som = trainGigaSOM(som, R, cc)
@time som = trainGigaSOM(som, R)

winners = mapToGigaSOM(som, R)

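Taken together, the updated tutorial workflow now fits in a handful of calls. A minimal sketch, assuming `dataPath`, `md`, and `panel` are defined as earlier in the tutorial:

```julia
using Distributed

nWorkers = 2
addprocs(nWorkers, topology=:master_worker)
@everywhere using GigaSOM

# loadData now wraps file splitting and parallel loading;
# the second return value (row ranges) is ignored here
R, _ = loadData(dataPath, md, nWorkers, panel=panel, reduce=true, transform=true)

som = initGigaSOM(R, 10, 10)
som = trainGigaSOM(som, R)      # cc is no longer passed explicitly
winners = mapToGigaSOM(som, R)
```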
6 changes: 3 additions & 3 deletions src/core.jl
@@ -125,7 +125,7 @@ end
- `radiusFun`: Function that generates radius decay, e.g. `linearRadius` or `expRadius(10.0)`
- `epochs`: number of SOM training iterations (default 10)
"""
function trainGigaSOM(som::Som, trainRef::Array{Any,1}, cc;
function trainGigaSOM(som::Som, trainRef::Array{Any,1};
kernelFun::Function = gaussianKernel,
metric = Euclidean(),
knnTreeFun = BruteTree,
@@ -155,7 +155,7 @@ function trainGigaSOM(som::Som, trainRef::Array{Any,1}, cc;
for (idx, pid) in enumerate(workers())
@async begin
# @info pid
R[idx] = fetch(@spawnat pid begin doEpoch(trainRef[idx], codes, tree, cc) end)
R[idx] = fetch(@spawnat pid begin doEpoch(trainRef[idx], codes, tree) end)
globalSumNumerator += R[idx][1]
globalSumDenominator += R[idx][2]
end
@@ -275,7 +275,7 @@ vectors and the adjustment in radius after each epoch.
- `codes`: Codebook
- `tree`: knn-compatible tree built upon the codes
"""
function doEpoch(x::Ref, codes::Array{Float64, 2}, tree, cc)
function doEpoch(x::Ref, codes::Array{Float64, 2}, tree)

# initialise numerator and denominator with 0's
sumNumerator = zeros(Float64, size(codes))
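The net effect of the core.jl change: the clustering columns (`cc`) are resolved once at load time inside `loadDataFile`, so the training entry points no longer take them. A before/after sketch of the call site:

```julia
# before: clustering columns passed through training
cc = map(Symbol, vcat(lineageMarkers, functionalMarkers))
som = trainGigaSOM(som, R, cc)

# after: the data behind each reference in R is already reduced
# to the panel columns, so training needs only the references
som = trainGigaSOM(som, R)
```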
96 changes: 78 additions & 18 deletions src/io/input.jl
@@ -69,40 +69,99 @@ function readFlowFrame(filename::String)
end

"""
loadData(idx, fn, md,panel; method = "asinh", cofactor = 5,
reduce = true, sort = true)
loadData(dataPath, data, nWorkers; panel=Nothing(),
type = "fcs", method = "asinh", cofactor = 5,
reduce = false, sort = false, transform = false)

This function has two parts. Part 1 generates the temporary binary files to be loaded by the
workers; the input data is divided equally into n parts according to the number of workers.
Part 2: each worker independently loads its own data package in parallel and returns a reference to it.

# Arguments:
- `dataPath`: path to data folder
- `data`: a single filename (String) or a metadata DataFrame with a `sample_name` column
- `panel`: Panel table with a column for Lineage Markers and one for Functional Markers,
or an `Array{Int}` of column indices, default: Nothing()
- `type`: String, type of datafile, default FCS
- `method`: transformation method, default arcsinh, optional
- `cofactor`: Cofactor for transformation, default 5, optional
- `reduce`: select only the columns defined by the lineage and functional markers, optional,
default: false. If false, check for any empty columns to be removed (such columns can appear
after concatenating FCS files, as well as parameters like time and event length)
- `sort`: sort columns by name to ensure a consistent order when concatenating the dataframes, optional, default: false
- `transform`: Boolean to indicate if the data will be transformed according to `method`, default: false
"""
function loadData(dataPath, data, nWorkers; panel=Nothing(),
type = "fcs", method = "asinh", cofactor = 5,
reduce = false, sort = false, transform = false)

xRange = generateIO(dataPath, data, nWorkers, true, 1, true)

R = Vector{Any}(undef,nWorkers)

# Load the data by each worker
# Without a panel, all columns are loaded:
# loadDataFile(idx, "input-$idx.jls", ...)
# Columns can be selected by an array of indices:
# loadDataFile(idx, "input-$idx.jls", [3:6;9:11], ...) <- this will concatenate the ranges into an array
# Please note that all optional arguments are "false" by default
if type == "fcs"
@sync for (idx, pid) in enumerate(workers())
@async R[idx] = fetch(@spawnat pid loadDataFile(idx, "input-$idx.jls", panel, method,
cofactor,reduce, sort, transform))
end
else
@error "File Type not yet supported!"
end

return R, xRange

end
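For illustration, a hedged sketch of the three accepted `panel` forms; the file name `sample.fcs` is made up:

```julia
# 1) panel as a DataFrame with lineage/functional marker columns
R, xRange = loadData(dataPath, md, nWorkers, panel=panel, reduce=true, transform=true)

# 2) panel as column indices ([3:6; 9:11] concatenates the ranges
#    into an Array{Int64,1})
R, xRange = loadData(dataPath, "sample.fcs", nWorkers, panel=[3:6; 9:11])

# 3) no panel: all columns are used for clustering
R, xRange = loadData(dataPath, "sample.fcs", nWorkers)
```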

"""
loadDataFile(idx, fn, panel, method, cofactor, reduce, sort, transform)

Load the data in parallel on each worker. Returns a reference to the loaded data.

# Arguments:
- `idx`: worker index
- `fn`: filename
- `md`: Metadata table
- `panel`: Panel table with a column for Lineage Markers and one for Functional Markers
- `panel`: Panel table with a column for Lineage Markers and one for Functional Markers,
or an `Array{Int}` of column indices
- `method`: transformation method, default arcsinh, optional
- `cofactor`: Cofactor for transformation, default 5, optional
- `reduce`: select only the columns defined by the lineage and functional markers.
If false, check for any empty columns to be removed (such columns can appear
after concatenating FCS files, as well as parameters like time and event length)
- `sort`: sort columns by name to ensure a consistent order when concatenating the dataframes
- `transform`: Boolean to indicate if the data will be transformed according to `method`
"""
function loadData(idx, fn, md, panel; method = "asinh", cofactor = 5,
reduce = true, sort = true)
function loadDataFile(idx, fn, panel, method, cofactor, reduce, sort, transform)

y = open(deserialize, fn)
fcsRaw = y[idx]
cleanNames!(fcsRaw)

# extract lineage markers
lineageMarkers, functionalMarkers = getMarkers(panel)

cc = map(Symbol, vcat(lineageMarkers, functionalMarkers))
# markers can be lineage and functional at the same time
# therefore make cc unique
unique!(cc)
fcsData = y[idx]
cleanNames!(fcsData)

# Define the clustering column by range object
if typeof(panel) == Array{Int64,1}
cc = panel
elseif typeof(panel) == DataFrame
# extract lineage markers
lineageMarkers, functionalMarkers = getMarkers(panel)
cc = map(Symbol, vcat(lineageMarkers, functionalMarkers))
# markers can be lineage and functional at the same time
# therefore make cc unique
unique!(cc)
else
# If no panel is provided, use all column names as cc
# and set reduce to false
cc = map(Symbol, names(fcsData))
end

fcsData = transformData(fcsRaw, method, cofactor)
fcsData = sortReduce(fcsData, cc, reduce, sort)
if transform
fcsData = transformData(fcsData, method, cofactor)
end
sortReduce(fcsData, cc, reduce, sort)

# get the sample_id from md
# return value is an array with only one entry -> take [1]
@@ -118,3 +118,4 @@ function loadData(idx, fn, md, panel; method = "asinh", cofactor = 5,

return (dfallRef)
end

106 changes: 92 additions & 14 deletions src/satellites.jl
@@ -184,11 +184,15 @@ and at the given location.
- `inSize`: Vector with the lengths of each file within the input data set
- `runSum`: Running sum of the `inSize` vector (`runSum[end] == totalSize`)
"""
function getTotalSize(loc, md, printLevel=0)
function getTotalSize(loc, md::Any, printLevel=0)
global totalSize, tmpSum

# define the file names
fileNames = sort(md.file_name)
if typeof(md) == String
fileNames = [md]
else
# define the file names
fileNames = sort(md.file_name)
end

# output the number of files
if printLevel > 0
@@ -231,6 +235,7 @@ function getTotalSize(loc, md, printLevel=0)
return totalSize, inSize, runSum
end


"""
splitting(totalSize, nWorkers, printLevel=0)

@@ -250,7 +255,10 @@ given the total size and the number of workers
"""
function splitting(totalSize, nWorkers, printLevel=0)
# determine the size per file
fileL = Int(floor(totalSize/nWorkers))
fileL = div(totalSize, nWorkers)

# determine the remainder
extras = rem(totalSize,nWorkers)

# determine the size of the last (residual) file
lastFileL = Int(fileL + totalSize - nWorkers * fileL)
@@ -262,7 +270,21 @@ function splitting(totalSize, nWorkers, printLevel=0)
@info " > Total row count: $totalSize cells"
end

return fileL, lastFileL
# determine the ranges
nchunks = fileL > 0 ? nWorkers : extras
chunks = Vector{UnitRange{Int}}(undef, nchunks)
lo = 1
for i in 1:nchunks
hi = lo + fileL - 1
if extras > 0
hi += 1
extras -= 1
end
chunks[i] = lo:hi
lo = hi+1
end

return fileL, lastFileL, chunks
end
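As a sanity check on the chunking loop above, a standalone sketch; `chunkRanges` is a hypothetical helper, not part of the package. Ten rows over three workers yield `1:4`, `5:7`, `8:10`.

```julia
# mirrors the range computation in `splitting`
function chunkRanges(totalSize::Int, nWorkers::Int)
    fileL  = div(totalSize, nWorkers)  # base number of rows per worker
    extras = rem(totalSize, nWorkers)  # leftover rows, handed out one by one
    chunks = UnitRange{Int}[]
    lo = 1
    for _ in 1:(fileL > 0 ? nWorkers : extras)
        hi = lo + fileL - 1
        if extras > 0
            hi += 1
            extras -= 1
        end
        push!(chunks, lo:hi)
        lo = hi + 1
    end
    return chunks
end

chunkRanges(10, 3)  # 3-element Vector: 1:4, 5:7, 8:10
```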

"""
@@ -449,7 +471,7 @@ Generate binary .jls files given a path to files, their metadata, and the number
# INPUTS

- `filePath`: path to the files
- `md`: Metadata table
- `md`: Metadata table, or single file String
- `nWorkers`: number of workers
- `generateFiles`: Boolean to actually generate files
- `printLevel`: Verbose level (0: mute)
@@ -465,7 +487,7 @@ if `generateFiles` is `true`:
`nWorkers` files named `input-<workerID>.jls` saved in the directory `filePath`.

"""
function generateIO(filePath, md, nWorkers, generateFiles=true, printLevel=0, saveIndices=false)
function generateIO(filePath, md::DataFrame, nWorkers, generateFiles=true, printLevel=0, saveIndices=false)

# determine the total size, the vector of sizes, and their running sum
totalSize, inSize, runSum = getTotalSize(filePath, md, printLevel)
@@ -506,19 +528,54 @@ function generateIO(filePath, md::DataFrame, nWorkers, generateFiles=true, printLevel=0, saveIndices=false)
end

# output the file per worker
if generateFiles
open(f -> serialize(f,out), "input-$worker.jls", "w")
if printLevel > 0
printstyled("[ Info: > File input-$worker.jls written.\n", color=:green, bold=true)
end
end
outputFile(out, "input-$worker.jls", generateFiles)
end

if saveIndices
return localStartVect, localEndVect
end
end

"""
generateIO(filePath, fn::String, nWorkers, generateFiles=true, printLevel=0, saveIndices=false)

Generate binary .jls files for a single file given a path and the number of workers

# INPUTS

- `filePath`: path to the data folder
- `fn`: file name
- `nWorkers`: number of workers
- `generateFiles`: Boolean to actually generate files
- `printLevel`: Verbose level (0: mute)
- `saveIndices`: Boolean to save the local indices

# OUTPUTS

if `saveIndices` is `true`:
- `chunks`: vector of row ranges, one per worker

if `generateFiles` is `true`:
- `nWorkers` files named `input-<workerID>.jls` saved in the directory `filePath`.

"""
function generateIO(filePath, fn::String, nWorkers, generateFiles=true, printLevel=0, saveIndices=false)

# read the single file and split it according to the number of workers.
inFile = readFlowFrame(filePath * Base.Filesystem.path_separator * fn)
_, _, chunks = splitting(size(inFile, 1), nWorkers, 0)

for i in 1:length(chunks)
out = Dict()
out[i] = inFile[chunks[i], :]
outputFile(out, "input-$i.jls", generateFiles)
end

if saveIndices
return chunks
end
end
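A short usage sketch of the single-file method (the file name is hypothetical); with `saveIndices=true` it returns the row ranges computed by `splitting`:

```julia
# split one FCS file into nWorkers serialized packages,
# input-1.jls ... input-<nWorkers>.jls, and keep the row ranges
chunks = generateIO(dataPath, "sample.fcs", nWorkers, true, 0, true)
```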

"""
rmFile(fileName, printLevel = 1)

Expand All @@ -529,7 +586,7 @@ Remove a file.
- `fileName`: name of file to be removed
- `printLevel`: Verbose level (0: mute)
"""
function rmFile(fileName, printLevel = 1)
function rmFile(fileName, printLevel=0)
try
if printLevel > 0
printstyled("> Removing $fileName ... ", color=:yellow)
@@ -543,4 +600,25 @@ function rmFile(fileName, printLevel = 1)
printstyled("(file $fileName does not exist - skipping).\n", color=:red)
end
end
end

"""
outputFile(out, fileName, generateFiles=true, printLevel=0)

Generate a file given a name and content.

# INPUTS

- `out`: content of the file
- `fileName`: name of the file to be written
- `generateFiles`: Boolean to actually generate files
- `printLevel`: Verbose level (0: mute)
"""
function outputFile(out, fileName, generateFiles=true, printLevel=0)
if generateFiles
open(f -> serialize(f,out), fileName, "w")
if printLevel > 0
printstyled("[ Info: > File $fileName written.\n", color=:green, bold=true)
end
end
end
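The packages written by `outputFile` are read back on the workers with `open(deserialize, fn)`, as in `loadDataFile`. A minimal round-trip sketch with a toy payload:

```julia
using Serialization

out = Dict(1 => [1.0 2.0; 3.0 4.0])   # toy data package for worker 1
outputFile(out, "input-1.jls", true)  # serialize to disk
y = open(deserialize, "input-1.jls")  # what loadDataFile does
y[1]                                  # worker 1's share of the data
rmFile("input-1.jls")                 # clean up the temporary file
```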
2 changes: 2 additions & 0 deletions test/runtests.jl
@@ -12,6 +12,8 @@ checkDir()
include("testSatellites.jl")
include("testSplitting.jl")
include("testTrainingOuputEquality.jl")
include("testSingleFileSplitting.jl")
include("testLoadData.jl")
end

cd(owd)