我在 R 中有如下数据集：
# Simulate 100000 exam records: a student id, a binary result and an exam date.
id <- sample.int(10000, 100000, replace = TRUE)
res <- c(1, 0)
results <- sample(res, 100000, replace = TRUE)
date_exam_taken <- sample(
  seq(as.Date("1999/01/01"), as.Date("2020/01/01"), by = "day"),
  100000,
  replace = TRUE
)
my_data <- data.frame(
  id = id,
  results = results,
  date_exam_taken = date_exam_taken
)

# Sort by student, then chronologically within each student.
my_data <- my_data[with(my_data, order(id, date_exam_taken)), ]

# Number each student's exams 1, 2, 3, ... in the sorted order.
my_data$exam_number <- ave(
  seq_len(nrow(my_data)),
  my_data$id,
  FUN = seq_along
)
我用 R 编写了下面这个循环，用于计算在给定上一次考试结果的条件下，学生下一次考试结果的条件概率：
library(data.table)
# For every student, tabulate consecutive exam results (`first` = earlier
# exam, `second` = the exam right after it) and stack the per-student tables
# into `final`.
setDT(my_data)

# Iterate over the ids that actually occur in the data. Looping over
# 1..length(unique(id)) is only correct when the ids are exactly 1..n with no
# gaps; otherwise missing ids are processed and the largest ids are skipped.
student_ids <- sort(unique(my_data$id))

# One slot per student. Slots of students that fail below stay NULL and are
# dropped by rbindlist().
my_vector <- vector("list", length(student_ids))

for (k in seq_along(student_ids)) {
  cur_id <- student_ids[k]
  tryCatch({
    start_i <- my_data[id == cur_id]
    # Dropping the last / first result aligns exam j with exam j + 1.
    pairs_i <- data.frame(
      first = head(start_i$results, -1),
      second = tail(start_i$results, -1)
    )
    frame_i <- as.data.table(table(pairs_i))
    frame_i[, id := cur_id]
    # Progress output; printing every table dominates the run time, so this
    # line can be removed once the results have been checked.
    print(frame_i)
    my_vector[[k]] <- frame_i
  }, error = function(e) {
    # Report the failure instead of swallowing it silently, then continue
    # with the next student.
    message("Skipping id ", cur_id, ": ", conditionMessage(e))
  })
}

final <- rbindlist(my_vector)
我现在想把这段代码“向量化”以提高效率。下面是我的尝试：
# Functional variant of the per-student loop: build one table of consecutive
# exam results per student and stack the tables into `final`.
#
# lapply() is used instead of sapply(): every successful call returns a data
# frame with the same number of columns, and sapply() simplifies such
# equal-length results into a matrix of list cells, which rbindlist() cannot
# stack. lapply() always returns a plain list, so no result container has to
# be created in advance.

# Converting once is enough; there is no need to repeat it for every student.
setDT(my_data)

my_vector <- lapply(unique(my_data$id), function(i) {
  tryCatch({
    start_i <- my_data[my_data$id == i, ]
    # Dropping the last / first result aligns exam j with exam j + 1.
    pairs_i <- data.frame(
      first = head(start_i$results, -1),
      second = tail(start_i$results, -1)
    )
    frame_i <- as.data.frame(table(pairs_i))
    frame_i$i <- i
    # Progress output; can be removed for speed once the results are checked.
    print(frame_i)
    frame_i
  }, error = function(e) {
    print(paste("An error occurred:", conditionMessage(e)))
    # Return NULL rather than the printed string so that the failed student
    # is skipped by rbindlist() instead of breaking it.
    NULL
  })
})

# Stack the per-student tables; NULL entries are ignored. The result is a
# data.table, so a second rbind pass over the same list is not needed.
final <- rbindlist(my_vector, fill = TRUE)
我是否正确地“向量化”了这段代码？
更新:概率计算模板:
library(dplyr)
# Turn the stacked tables in `final` into conditional shares of `second`
# given `first`.
# NOTE(review): n() counts the rows of `final` per (first, second) pair. If
# `final` carries a per-student count column, confirm that counting rows
# rather than summing that column is what is intended.
total_1 <- final %>%
  group_by(first, second) %>%
  summarise(totals = n())

# Denominator: total over all `second` values for each `first` value.
total_2 <- total_1 %>%
  group_by(first) %>%
  summarise(sum = sum(totals))

join <- merge(total_1, total_2, by = "first", all = TRUE)
join[["probs"]] <- join[["totals"]] / join[["sum"]]

# Show the table without rows that contain missing values.
na.omit(join)
最佳答案
library(magrittr)
library(dplyr)
# Within each id, pair every result with the one that follows it, using the
# rows in their existing order.
# NOTE(review): summarise() returns several rows per group here; newer dplyr
# releases recommend reframe() for that - confirm against the installed
# version before switching.
my_data %>%
  group_by(id) %>%
  summarise(
    first = head(results, -1),
    second = tail(results, -1)
  ) %>%
  ungroup()
关于r - R 中的向量化函数,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/74792478/