library(data.table)
# Sample table: ten rows mixing numeric columns and factor columns, with NAs
# scattered through both kinds.
# Note: the third column is named "continuousFactorTwo" even though it holds
# numeric values; the name is kept because the printed output relies on it.
sample_columns <- list(
  continuousNumericOne = c(
    3.82495116149284, 0.915662542284416, 0.751001771620762, NA, NA,
    8.07583989184169, 4.57303752008246, 4.02747047825306, 2.79953011697721,
    4.28614794390785
  ),
  catagoricalFactorOne = factor(
    c("red", "red", "red", NA, "red", NA, "green", "green", "green", NA),
    levels = c("blue", "green", "red")
  ),
  continuousFactorTwo = c(
    NA, NA, 2.58285715825289, -2.71316582700148, 3.95645652249594,
    1.96862094118233, 4.96960533647993, 6.15199683070215, 3.98091405116921,
    NA
  ),
  catagoricalFactorTwo = factor(
    c("red", "red", "red", NA, "red", "red",
      "orange", "orange", "orange", "blue"),
    levels = c("blue", "orange", "red")
  ),
  continuousNumericThree = c(
    3.43332616062442, 2.21448227693603, 2.31889349781533, NA, NA,
    3.57539465909581, 3.28076535012702, NA, 3.15063300766727,
    2.9556632429251
  ),
  continuousNumericFour = c(
    7.77131807052585, NA, 6.5830522592014, NA, 7.36003333388333,
    8.25217350122047, 7.18282902739316, 8.60641407074177, 4.87689328481095,
    NA
  )
)

# Attach the compact row names and the data.table/data.frame classes directly,
# exactly as a dput() of the table would.
df <- structure(
  sample_columns,
  row.names = c(NA, -10L),
  class = c("data.table", "data.frame")
)
> df
continuousNumericOne catagoricalFactorOne continuousFactorTwo catagoricalFactorTwo continuousNumericThree continuousNumericFour
1: 3.8249512 red NA red 3.433326 7.771318
2: 0.9156625 red NA red 2.214482 NA
3: 0.7510018 red 2.582857 red 2.318893 6.583052
4: NA NA -2.713166 NA NA NA
5: NA red 3.956457 red NA 7.360033
6: 8.0758399 NA 1.968621 red 3.575395 8.252174
7: 4.5730375 green 4.969605 orange 3.280765 7.182829
8: 4.0274705 green 6.151997 orange NA 8.606414
9: 2.7995301 green 3.980914 orange 3.150633 4.876893
10: 4.2861479 NA NA blue 2.955663 NA
如何制作一个自定义函数来处理如下数据...
如果该列是分类(因子),则将所有 NA 替换为“空白”
如果列是连续的(数字),则需要额外的灵活性来进一步处理数据,例如,首先将数据缩放到 0 到 1 之间,然后根据需要替换 NA,可能替换为 -1.1。
我花了很多时间创建列表,试图跟踪列名以及给定的列是否是因子,并尝试通过 apply 系列方法应用不同的函数,但仍然没有成功。
如果有更好的方法,我会洗耳恭听。
最佳答案
我们可以创建一个函数
#' Clean a table column by column according to column type
#'
#' Factor columns have their `NA`s replaced by an extra level (`na_label`).
#' Numeric columns are first transformed with `scale_fn` and then any `NA`
#' left in the result is replaced by `na_value`. Columns of any other type
#' are left untouched.
#'
#' With the default arguments the result is the same as the original
#' hard-coded version: z-score standardisation via `scale()`, `""` as the
#' factor fill level and `-1.1` as the numeric fill value.
#'
#' @param dat A data.table or a plain data.frame. A data.table is updated
#'   with `:=`, which works by reference, so the caller's object may be
#'   modified as well; pass `copy(dat)` to avoid that.
#' @param na_label Level used to replace `NA` in factor columns.
#' @param na_value Value used to replace `NA` in the transformed numeric
#'   columns.
#' @param scale_fn Function applied to each numeric column before `NA`s are
#'   filled. It must return a vector of the same length as its input. Supply
#'   e.g. a min-max rescaler to map values onto the 0-1 range.
#' @return The processed table, of the same class as `dat`.
f1 <- function(dat, na_label = "", na_value = -1.1,
               scale_fn = function(x) as.vector(scale(x))) {
  # vapply() guarantees a logical vector even for a zero-column table.
  cat_idx <- which(vapply(dat, is.factor, logical(1L)))
  num_idx <- which(vapply(dat, is.numeric, logical(1L)))

  # Add the fill level (only if it is not already a level), then fill NAs.
  fill_factor <- function(x) {
    levels(x) <- union(levels(x), na_label)
    x[is.na(x)] <- na_label
    x
  }

  # Transform first, then fill: NAs (and NaNs produced by the transform,
  # e.g. for a constant column) all end up as `na_value`.
  fill_numeric <- function(x) {
    transformed <- scale_fn(x)
    transformed[is.na(transformed)] <- na_value
    transformed
  }

  if (inherits(dat, "data.table")) {
    # Skip empty column sets so `:=` is never called with nothing to assign.
    if (length(cat_idx) > 0L) {
      dat[, (cat_idx) := lapply(.SD, fill_factor), .SDcols = cat_idx]
    }
    if (length(num_idx) > 0L) {
      dat[, (num_idx) := lapply(.SD, fill_numeric), .SDcols = num_idx]
    }
    # Trailing [] makes the result print when the call is not assigned.
    dat[]
  } else {
    if (length(cat_idx) > 0L) {
      dat[cat_idx] <- lapply(dat[cat_idx], fill_factor)
    }
    if (length(num_idx) > 0L) {
      dat[num_idx] <- lapply(dat[num_idx], fill_numeric)
    }
    dat
  }
}
# Apply f1 to the sample table; the printed result is reproduced in the
# comment lines that follow.
f1(df)
#continuousNumericOne catagoricalFactorOne continuousFactorTwo
# 1: 0.07257304 red -1.1000000
# 2: -1.18235090 red -1.1000000
# 3: -1.25337745 red -0.1400258
# 4: -1.10000000 -1.9826003
# 5: -1.10000000 red 0.3378723
# 6: 1.90619723 -0.3537288
# 7: 0.39526068 green 0.6903636
# 8: 0.15992990 green 1.1017373
# 9: -0.36974314 green 0.3463815
#10: 0.27151063 -1.1000000
# catagoricalFactorTwo continuousNumericThree continuousNumericFour
# 1: red 0.83246346 0.43436598
# 2: red -1.45562130 -1.10000000
# 3: red -1.25961447 -0.52487557
# 4: -1.10000000 -1.10000000
# 5: red -1.10000000 0.10235154
# 6: red 1.09916272 0.82254218
# 7: orange 0.54606741 -0.04069872
# 8: orange -1.10000000 1.10850704
# 9: orange 0.30177540 -1.90219245
#10: blue -0.06423321 -1.10000000
关于R 使用自定义函数按列处理数据的最佳方式,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/45640233/