FluxML · lorenzoh · Aug 10, 2021 · Jun 6, 2021 · Jun 8, 2021 · Jun 18, 2021
diff --git a/src/DataAugmentation.jl b/src/DataAugmentation.jl
@@ -28,6 +28,7 @@ include("./sequence.jl")
 include("./items/arrayitem.jl")
 include("./projective/base.jl")
 include("./items/image.jl")
+include("./items/table.jl")
 include("./items/keypoints.jl")
 include("./items/mask.jl")
 include("./projective/compose.jl")
@@ -36,6 +37,7 @@ include("./projective/affine.jl")
 include("./projective/warp.jl")
 include("./oneof.jl")
 include("./preprocessing.jl")
+include("./rowtransforms.jl")
 include("./colortransforms.jl")
 include("testing.jl")
 include("./visualization.jl")
@@ -49,6 +51,7 @@ export Item,
     Sequence,
     Project,
     Image,
+    TabularItem,
     Keypoints,
     Polygon,
     ToEltype,
@@ -88,7 +91,8 @@ export Item,
     onehot,
     showitems,
     showgrid,
-    Bounds
+    Bounds,
+    getcategorypools
 
 
 end # module
diff --git a/src/items/table.jl b/src/items/table.jl
@@ -0,0 +1,4 @@
+"""
+    TabularItem(data, columns)
+
+Item wrapping a single table row.
+
+`data` holds the row's values and `columns` the matching column names. The row
+transforms walk `columns` and `data` in lockstep, so both are expected to list
+the columns in the same order.
+"""
+struct TabularItem{T} <: Item
+    data::T
+    columns::Any
+end
diff --git a/src/rowtransforms.jl b/src/rowtransforms.jl
@@ -0,0 +1,125 @@
+"""
+    NormalizeRow(dict, cols)
+
+Transform that standardizes selected columns of a `TabularItem` row.
+
+For each column named in `cols`, a value `x` is replaced by `(x - mean) / std`,
+where `(mean, std)` is the tuple stored in `dict` under that column's name.
+Columns that are not named in `cols` are passed through unchanged.
+
+## Example
+
+```julia
+using DataAugmentation
+
+cols = [:col1, :col2, :col3]
+row = (; zip(cols, [1, 2, 3])...)
+item = TabularItem(row, cols)
+normdict = Dict(:col1 => (1, 1), :col2 => (2, 2))
+
+tfm = NormalizeRow(normdict, [:col1, :col2])
+apply(tfm, item)  # `:col1` and `:col2` become 0.0, `:col3` stays 3
+```
+"""
+struct NormalizeRow{D, C} <: Transform
+    dict::D
+    cols::C
+end
+
+# Return a new `TabularItem` in which every column listed in `tfm.cols` has been
+# replaced by `(value - mean) / std`, using the `(mean, std)` tuple stored for
+# that column in `tfm.dict`. All other columns are copied over unchanged.
+# `randstate` is accepted but not used by this method.
+function apply(tfm::NormalizeRow, item::TabularItem; randstate=nothing)
+    # Column names and row values are walked in lockstep; each step yields a
+    # `(name, value)` tuple from which the resulting `NamedTuple` is assembled.
+    x = NamedTuple(Iterators.map(item.columns, item.data) do col, val
+        if col in tfm.cols
+            colmean, colstd = tfm.dict[col]
+            val = (val - colmean)/colstd
+        end
+        (col, val)
+    end)
+    return TabularItem(x, item.columns)
+end
+
+"""
+    FillMissing(dict, cols)
+
+Transform that replaces `missing` values in selected columns of a
+`TabularItem` row.
+
+For each column named in `cols` whose value is `missing`, the value is replaced
+by the entry stored in `dict` under that column's name. Values that are not
+`missing`, and columns not named in `cols`, are passed through unchanged.
+
+## Example
+
+```julia
+using DataAugmentation
+
+cols = [:col1, :col2, :col3]
+row = (; zip(cols, [1, missing, 3])...)
+item = TabularItem(row, cols)
+fmdict = Dict(:col1 => 100, :col2 => 100)
+
+tfm = FillMissing(fmdict, [:col1, :col2])
+apply(tfm, item)  # `:col2` becomes 100, `:col1` and `:col3` are unchanged
+```
+"""
+struct FillMissing{D, C} <: Transform
+    dict::D
+    cols::C
+end
+
+# Return a new `TabularItem` where each `missing` value in a column listed in
+# `tfm.cols` is replaced by that column's entry in `tfm.dict`; everything else
+# is copied over unchanged. `randstate` is accepted but not used.
+function apply(tfm::FillMissing, item::TabularItem; randstate=nothing)
+    # The fill value is only looked up when it is actually needed.
+    filled = (
+        (name, (name in tfm.cols && ismissing(value)) ? tfm.dict[name] : value)
+        for (name, value) in zip(item.columns, item.data)
+    )
+    return TabularItem(NamedTuple(filled), item.columns)
+end
+
+"""
+    Categorify(dict, cols)
+
+Label-encodes the values of a row held in a `TabularItem` for the columns
+named in `cols`. `dict` maps each column name to the collection of categories
+known for that column.
+
+A value is encoded as its position in the column's category collection plus
+one. The code `1` is reserved for `missing` values, so any `missing` entries
+in the category collections are dropped, with a warning, when the transform is
+constructed. The `dict` passed in is not modified; the transform stores a
+cleaned shallow copy of it.
+
+## Example
+
+```julia
+using DataAugmentation
+
+cols = [:col1, :col2, :col3]
+row = (; zip(cols, ["cat", 2, 3])...)
+item = TabularItem(row, cols)
+catdict = Dict(:col1 => ["dog", "cat"])
+
+tfm = Categorify(catdict, [:col1])
+apply(tfm, item)  # `:col1` becomes 3: "cat" is the 2nd category, plus one
+```
+"""
+struct Categorify{T, S} <: Transform
+    dict::T
+    cols::S
+    function Categorify{T, S}(dict::T, cols::S) where {T, S}
+        # Clean a shallow copy so that the caller's dictionary is left untouched
+        # and no dictionary is written to while it is being iterated.
+        cleaned = copy(dict)
+        for (col, vals) in dict
+            if any(ismissing, vals)
+                cleaned[col] = filter(!ismissing, vals)
+                @warn "There is a missing value present for category '$col' which will be removed from Categorify dict"
+            end
+        end
+        return new{T, S}(cleaned, cols)
+    end
+end
+
+Categorify(dict::T, cols::S) where {T, S} = Categorify{T, S}(dict, cols)
+
+# Return a new `TabularItem` in which every column listed in `tfm.cols` is
+# replaced by an integer code: `1` for `missing`, otherwise the position of the
+# value in `tfm.dict[col]` plus one. Other columns are copied over unchanged.
+# Throws an `ArgumentError` when a value is not among the column's categories.
+# `randstate` is accepted but not used by this method.
+function apply(tfm::Categorify, item::TabularItem; randstate=nothing)
+    x = NamedTuple(Iterators.map(item.columns, item.data) do col, val
+        if col in tfm.cols
+            if ismissing(val)
+                val = 1
+            else
+                idx = findfirst(==(val), tfm.dict[col])
+                # `findfirst` returns `nothing` for an unseen category; report
+                # that explicitly instead of failing on `nothing + 1`.
+                isnothing(idx) && throw(ArgumentError(
+                    "value $(repr(val)) of column '$col' is not among the " *
+                    "categories known to Categorify"))
+                val = idx + 1
+            end
+        end
+        (col, val)
+    end)
+    return TabularItem(x, item.columns)
+end
diff --git a/test/imports.jl b/test/imports.jl
@@ -9,6 +9,7 @@ using CoordinateTransformations
 using DataAugmentation: Item, Transform, getrandstate, itemdata, setdata, ComposedProjectiveTransform,
     projectionbounds, getprojection, offsetcropbounds,
     CroppedProjectiveTransform, getbounds, project, project!, makebuffer, imagetotensor, imagetotensor!,
-    normalize, normalize!, tensortoimage, denormalize, denormalize!
+    normalize, normalize!, tensortoimage, denormalize, denormalize!,
+    NormalizeRow, FillMissing, Categorify, TabularItem
 using DataAugmentation: testitem, testapply, testapply!, testprojective
 import DataAugmentation: apply, compose
diff --git a/test/rowtransforms.jl b/test/rowtransforms.jl
@@ -0,0 +1,56 @@
+include("imports.jl")
+
+# Checks that `NormalizeRow` standardizes exactly the selected columns.
+@testset ExtendedTestSet "`NormalizeRow`" begin
+    cols = [:col1, :col2, :col3]
+    item = TabularItem((; zip(cols, [1, "a", 10])...), cols)
+    cols_to_normalize = [:col1, :col3]
+    col1_mean, col1_std = 10, 100
+    col3_mean, col3_std = 100, 10
+    normdict = Dict(:col1 => (col1_mean, col1_std), :col3 => (col3_mean, col3_std))
+
+    tfm = NormalizeRow(normdict, cols_to_normalize)
+    testapply(tfm, item)
+    titem = apply(tfm, item)
+    # Floating-point results are compared with `≈` rather than `==`.
+    @test titem.data[:col1] ≈ (item.data[:col1] - col1_mean)/col1_std
+    @test titem.data[:col3] ≈ (item.data[:col3] - col3_mean)/col3_std
+    # Columns that were not selected must pass through unchanged.
+    @test titem.data[:col2] == item.data[:col2]
+    @test titem.columns == item.columns
+end
+
+# Checks that `FillMissing` replaces `missing` only in the selected columns.
+@testset ExtendedTestSet "`FillMissing`" begin
+    colnames = [:col1, :col2, :col3]
+    item = TabularItem((; zip(colnames, [1, missing, missing])...), colnames)
+    fill1 = 1000.
+    fill3 = 1000.
+    fillvalues = Dict{Any, Any}(:col1 => fill1, :col3 => fill3)
+
+    # Only `:col1` and `:col3` are selected, so `:col2` must stay `missing`.
+    partial = FillMissing(fillvalues, [:col1, :col3])
+    @test_nowarn apply(partial, item)
+    filled = apply(partial, item)
+    @test filled.data[:col1] == coalesce(item.data[:col1], fill1)
+    @test filled.data[:col3] == coalesce(item.data[:col3], fill3)
+    @test ismissing(filled.data[:col2])
+
+    # With a fill value registered for `:col2` too, every column is covered.
+    fillvalues[:col2] = "d"
+    complete = FillMissing(fillvalues, [:col1, :col2, :col3])
+    testapply(complete, item)
+    filledall = apply(complete, item)
+    @test filledall.data[:col2] == coalesce(item.data[:col2], "d")
+end
+
+# Checks category-pool cleaning at construction and the resulting label codes.
+@testset ExtendedTestSet "`Categorify`" begin
+    colnames = [:col1, :col2, :col3, :col4]
+    item = TabularItem((; zip(colnames, [1, "a", "A", missing])...), colnames)
+
+    pools = Dict(:col2 => ["a", "b", "c"], :col3 => ["C", "B", "A"], :col4 => [missing, 10, 20])
+    tfm = Categorify(pools, [:col2, :col3, :col4])
+    # The constructor is expected to strip `missing` from the category pools.
+    @test !any(ismissing, tfm.dict[:col4])
+    @test_nowarn apply(tfm, item)
+    testapply(tfm, item)
+    encoded = apply(tfm, item)
+    # Codes are "position in pool + 1"; `missing` maps to 1.
+    @test encoded.data[:col2] == 2
+    @test encoded.data[:col3] == 4
+    @test encoded.data[:col4] == 1
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -44,4 +44,7 @@ include("./imports.jl")
     @testset ExtendedTestSet "visualization.jl" begin
         include("visualization.jl")
     end
+    @testset ExtendedTestSet "rowtransforms.jl" begin
+        include("rowtransforms.jl")
+    end
 end