dataframe - Julia:DataFramesMeta 转换

标签 dataframe julia

我正在尝试在 Julia 中重现以下 R 代码

library(dplyr)

women_new <- rbind(women, c(NA, 1), c(NA, NA))
women_new %>% 
  filter(height %>% complete.cases) %>%
  mutate(sector = character(n()),
         sector = replace(sector, height >= 0 & height <= 60, "1"),
         sector = replace(sector, height >= 61 & height <= 67, "2"), 
         sector = replace(sector, height >= 68 & height <= 72, "3"))

我在 Julia 中的尝试如下:
using DataFrames
using DataFramesMeta
using Lazy
using RDatasets

women = @> begin
  "datasets" 
  dataset("women")
  DataArray()
  vcat([[NA NA]; [NA NA]])
end

women_new = DataFrame(Height = women[:, 1], Weight = women[:, 2]);
women_new[16, 2] = 1;

我在这里的第一个问题是,有没有办法像在 R 中那样,直接在 vcat([[NA 1]; [NA NA]]) 中写入 1?如果我这样做,它会返回以下错误:
MethodError: Cannot `convert` an object of type DataArrays.NAtype to an object of type Int64
This may have arisen from a call to the constructor Int64(...),
since type constructors fall back to convert methods.
 in macro expansion at multidimensional.jl:431 [inlined]
 in macro expansion at cartesian.jl:64 [inlined]
 in macro expansion at multidimensional.jl:429 [inlined]
 in _unsafe_batchsetindex!(::Array{Int64,2}, ::Base.Repeated{DataArrays.NAtype}, ::UnitRange{Int64}, ::UnitRange{Int64}) at multidimensional.jl:421
 in setindex!(::Array{Int64,2}, ::DataArrays.NAtype, ::UnitRange{Int64}, ::UnitRange{Int64}) at abstractarray.jl:832
 in cat_t(::Int64, ::Type{T}, ::DataArrays.NAtype, ::Vararg{Any,N}) at abstractarray.jl:1098
 in hcat(::DataArrays.NAtype, ::Int64) at abstractarray.jl:1180
 in include_string(::String, ::String) at loading.jl:441
 in include_string(::String, ::String, ::Int64) at eval.jl:30
 in include_string(::Module, ::String, ::String, ::Int64, ::Vararg{Int64,N}) at eval.jl:34
 in (::Atom.##53#56{String,Int64,String})() at eval.jl:50
 in withpath(::Atom.##53#56{String,Int64,String}, ::String) at utils.jl:30
 in withpath(::Function, ::String) at eval.jl:38
 in macro expansion at eval.jl:49 [inlined]
 in (::Atom.##52#55{Dict{String,Any}})() at task.jl:60

我的第二个问题是,有没有办法把 DataArray 转换为 DataFrame?在这种情况下,由于 DataArray 没有列名,列名会变为 X1、X2、... 或 DataFrame 中的其他默认名称。我认为这比输入以下内容更整洁:
women_new = DataFrame(Height = women[:, 1], Weight = women[:, 2]);

我希望可以简单地执行 convert(DataFrame, women),然后重命名列名。但这种转换不起作用。下面是我仿照 R 的做法进行转换(mutate)的尝试。
@> begin
  women_new
  @where !isna(:Height)
  @transform(
    Sector = NA,
    Sector = ifelse(:Height .>=  0 & :Height .<= 60, 1,
             ifelse(:Height .>= 61 & :Height .<= 67, 2,
             ifelse(:Height .>= 68 & :Height .<= 72, 3, NA)))
    )
end

但这会返回:
15×3 DataFrames.DataFrame
│ Row │ Height │ Weight │ Sector│
├─────┼────────┼────────┼───────┤
│ 1   │ 58     │ 115    │ 1     │
│ 2   │ 59     │ 117    │ 1     │
│ 3   │ 60     │ 120    │ 1     │
│ 4   │ 61     │ 123    │ 1     │
│ 5   │ 62     │ 126    │ 1     │
│ 6   │ 63     │ 129    │ 1     │
│ 7   │ 64     │ 132    │ 1     │
│ 8   │ 65     │ 135    │ 1     │
│ 9   │ 66     │ 139    │ 1     │
│ 10  │ 67     │ 142    │ 1     │
│ 11  │ 68     │ 146    │ 1     │
│ 12  │ 69     │ 150    │ 1     │
│ 13  │ 70     │ 154    │ 1     │
│ 14  │ 71     │ 159    │ 1     │
│ 15  │ 72     │ 164    │ 1     │

这与 R 的结果不一致。我还尝试了以下内容:
@> begin
  women_new
  @where !isna(:Height)
  @transform(
    Sector = NA,
    Sector = :Height .>=  0 & :Height .<= 60 ? 1 :
             :Height .>= 61 & :Height .<= 67 ? 2 :
             :Height .>= 68 & :Height .<= 72 ? 3 :
            NA;
    )
end

但返回以下错误:
TypeError: non-boolean (DataArrays.DataArray{Bool,1}) used in boolean context
 in (::###469#303)(::DataArrays.DataArray{Int64,1}) at DataFramesMeta.jl:55
 in (::##298#302)(::DataFrames.DataFrame) at DataFramesMeta.jl:295
 in #transform#38(::Array{Any,1}, ::Function, ::DataFrames.DataFrame) at DataFramesMeta.jl:270
 in (::DataFramesMeta.#kw##transform)(::Array{Any,1}, ::DataFramesMeta.#transform, ::DataFrames.DataFrame) at <missing>:0
 in include_string(::String, ::String) at loading.jl:441
 in include_string(::String, ::String, ::Int64) at eval.jl:30
 in include_string(::Module, ::String, ::String, ::Int64, ::Vararg{Int64,N}) at eval.jl:34
 in (::Atom.##53#56{String,Int64,String})() at eval.jl:50
 in withpath(::Atom.##53#56{String,Int64,String}, ::String) at utils.jl:30
 in withpath(::Function, ::String) at eval.jl:38
 in macro expansion at eval.jl:49 [inlined]
 in (::Atom.##52#55{Dict{String,Any}})() at task.jl:60

如果您能帮我解决这个问题,我将不胜感激。最后,我的最后一个问题是,有没有办法像在 R 中那样缩短我的代码但仍然优雅?

最佳答案

我明白了。问题出在运算符优先级上:我原以为不需要括号,但实际上 & 的优先级高于 .>= 和 .<=,所以每个比较表达式都必须用括号括起来。

using DataFrames
using DataFramesMeta
using Lazy
using RDatasets

women = dataset("datasets", "women");
women_new = vcat(
              women,
              DataFrame(Height = [NA; NA], Weight = @data [1; NA])
            )

@> begin
  women_new
  @where !isna(:Height)
  @transform(
    Class = NA,
    Class = ifelse((:Height .>=  0) & (:Height .<= 60), 1,
            ifelse((:Height .>= 61) & (:Height .<= 67), 2,
            ifelse((:Height .>= 68) & (:Height .<= 72), 3, NA)))
            )
end

更新:上面的代码可以进一步简化为:
@> begin
  women_new
  @where !isna(:Height)
  @transform(
    Class = @> begin
      function (x)
         0 <= x <= 60 ?  1 :
        61 <= x <= 67 ?  2 :
        68 <= x <= 72 ?  3 :
        NA
      end
      map(:Height)
    end
  )
end

或者另一种方法是使用 Query.jl如下:
using DataFrames
using Query
using RDatasets

women = dataset("datasets", "women");
women_new = vcat(
              women,
              DataFrame(Height = [NA; NA], Weight = @data [1; NA])
            )

@from i in women_new begin
    @where !isnull(i.Height)
    @select {
        i.Height, i.Weight,
        class = 0 <= i.Height <= 60 ?  1 :
               61 <= i.Height <= 67 ?  2 :
               68 <= i.Height <= 72 ?  3 :
                0
    }
    @collect DataFrame
end

输出现在是正确的:
15×3 DataFrames.DataFrame
│ Row │ Height │ Weight │ Class │
├─────┼────────┼────────┼───────┤
│ 1   │ 58     │ 115    │ 1     │
│ 2   │ 59     │ 117    │ 1     │
│ 3   │ 60     │ 120    │ 1     │
│ 4   │ 61     │ 123    │ 2     │
│ 5   │ 62     │ 126    │ 2     │
│ 6   │ 63     │ 129    │ 2     │
│ 7   │ 64     │ 132    │ 2     │
│ 8   │ 65     │ 135    │ 2     │
│ 9   │ 66     │ 139    │ 2     │
│ 10  │ 67     │ 142    │ 2     │
│ 11  │ 68     │ 146    │ 3     │
│ 12  │ 69     │ 150    │ 3     │
│ 13  │ 70     │ 154    │ 3     │
│ 14  │ 71     │ 159    │ 3     │
│ 15  │ 72     │ 164    │ 3     │

如果我们不想过滤 NA 并使用完整的数据,那么我能做的最好的事情如下:
@> begin
  women_new
  @transform(
    Height_New = NA,
    Height_New = ifelse(isna(:Height), -1, :Height))
  @transform(
    Class = NA,
    Class = ifelse(:Height_New == -1, NA,
              ifelse((:Height_New .>=  0) & (:Height_New .<= 60), 1,
              ifelse((:Height_New .>= 61) & (:Height_New .<= 67), 2,
              ifelse((:Height_New .>= 68) & (:Height_New .<= 72), 3, NA))))
  )
  delete!(:Height_New)
end

更新:上面的代码可以进一步简化为:
@> begin
    women_new
    @transform(
        Class = @> begin
            function (x)
                isna(x)       ? NA :
                 0 <= x <= 60 ?  1 :
                61 <= x <= 67 ?  2 :
                68 <= x <= 72 ?  3 :
                NA
            end
            map(:Height)
        end
    )
end

或者另一种方法是使用 Query.jl如下:
@from i in women_new begin
    @select {
        i.Height, i.Weight,
        class = 0 <= i.Height <= 60 ?  1 :
               61 <= i.Height <= 67 ?  2 :
               68 <= i.Height <= 72 ?  3 :
                0
    }
    @collect DataFrame
end

输出:
17×3 DataFrames.DataFrame
│ Row │ Height │ Weight │ Class │
├─────┼────────┼────────┼───────┤
│ 1   │ 58     │ 115    │ 1     │
│ 2   │ 59     │ 117    │ 1     │
│ 3   │ 60     │ 120    │ 1     │
│ 4   │ 61     │ 123    │ 2     │
│ 5   │ 62     │ 126    │ 2     │
│ 6   │ 63     │ 129    │ 2     │
│ 7   │ 64     │ 132    │ 2     │
│ 8   │ 65     │ 135    │ 2     │
│ 9   │ 66     │ 139    │ 2     │
│ 10  │ 67     │ 142    │ 2     │
│ 11  │ 68     │ 146    │ 3     │
│ 12  │ 69     │ 150    │ 3     │
│ 13  │ 70     │ 154    │ 3     │
│ 14  │ 71     │ 159    │ 3     │
│ 15  │ 72     │ 164    │ 3     │
│ 16  │ NA     │ 1      │ NA    │
│ 17  │ NA     │ NA     │ NA    │

在这种情况下,代码会变得比较凌乱,因为 ifelse 的第一个参数目前还没有处理 NA 的方法。

关于dataframe - Julia:DataFramesMeta 转换,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/43691789/

相关文章:

dataframe - 如何在 Julia 中对 DateTime 或 Time 类型进行取模?

Julia 语言 : Redirecting stdout does not affect every println//How to extract value from stdout

python - 重新索引数据帧

python - pandas.Int64Index 修复 FutureWarning

python - 合并 Pandas 数据框后获取百分比

python - 根据条件计算组内共享元素的数量

r - 如何根据唯一项目 ID 对列中的项目值求和?

julia - 极其稀疏整数二次规划

git - Julia 证书错误

linux - 是否可以更改远程服务器(Linux RedHat 4.4.7-17)上的默认浏览器(lynx)? (使用 Gadfly 在 Julia 中绘图)