我有一个一年有 40 天的数据和一些数据
set.seed(123)
df <- data.frame(day = 1:40,rain = runif(40,min = 0, max = 3), petc = runif(40, min = 0.3, max = 8),swc = runif(40, min = 27.01, max = 117.43))
我想为每一天计算另一个名为 aetc 的变量,计算如下:
SW.ini <- 2 # setting some initial values
SW.max <- 5
SW.min <- 0
第 1 天,
1) 确定一个名为PAW(day1) = SW.ini + rain(day1)
的变量
2) 如果PAW(day1) >= SWC(day1), aetc(day1) = petc(day1)
;
If `PAW(day1) < SWC(day1), aetc(day1) = PAW(day1)/SWC(day1) * petc(day1)`
3) 检查是否aetc(day1) > PAW(day1). If yes, aetc(day1) = paw(day1)
4) 更新SW(day1) = SW.ini + rain(day1) - aetc(day1)
5) 如果SW(day1) > SW.max, SW(day1) = SW.max. Similarly if
SW(day1) < SW.min, SW(day1) = SW.min`
第二天重复
1) 确定PAW(day2) = SW(day1) + rain(day2)
2)如果PAW(day2) >= SWC(day2), aetc(day2) = petc(day2)
;
如果PAW(day2) < SWC(day2), aetc(day2) = PAW(day2)/SWC(day2) * petc(day2)
3) 检查是否aetc(day2) > PAW(day2)
.如果是,aetc(day2) = paw(day2)
4) 更新SW(day2) = SW(day1) + rain(day2) - aetc(day2)
5) 如果SW(day2) > SW.max, SW(day2) = SW.max. Similarly if
SW(day2) < SW.min,
SW(day2) = SW.min`
这是我用于执行此操作的优雅 for 循环:
df$PAW <- NA
df$aetc <- NA
df$SW <- NA
df$PAW[1] <- SW.ini + df$rain[1]
df$aetc[1] <- ifelse(df$PAW[1] >= df$swc[1], df$petc[1],(df$PAW[1]/df$swc[1])*df$petc[1])
df$aetc[1] <- ifelse(df$aetc[1] > df$PAW[1], df$PAW[1], df$aetc[1])
df$SW[1] <- SW.ini + df$rain[1] - df$aetc[1]
df$SW[1] <- ifelse(df$SW[1] > SW.max, SW.max, ifelse(df$SW[1] < 0, 0,df$SW[1]))
for (day in 2:nrow(df)){
df$PAW[day] <- df$SW[day - 1] + df$rain[day]
df$aetc[day] <- ifelse(df$PAW[day] >= df$swc[day], df$petc[day], (df$PAW[day]/df$swc[day]) * df$petc[day])
df$aetc[day] <- ifelse(df$aetc[day] > df$PAW[day], df$PAW[day],df$aetc[day])
df$SW[day] <- df$SW[day - 1] + df$rain[day] - df$aetc[day]
df$SW[day] <- ifelse(df$SW[day] > SW.max,SW.max, ifelse(df$SW[day] < 0, 0,df$SW[day]))
}
我的问题是这只是一年的数据,我想运行它多年。
set.seed(123)
df <- data.frame(year = 1980:2015, day = rep(1:40, each = 36),rain =
runif(40*36,min = 0, max = 3), petc = runif(40*36, min = 0.3, max = 8),swc = runif(40*36, min = 27.01, max = 117.43))
所以我想做类似的事情
df %>% group_by(year) # and then run the above function for each year.
是否有 dplyr 或任何其他解决方案?
谢谢
最佳答案
Note: I originally posted this answer on your follow up question, R: for loop within a foreach loop, but after seeing this one, it seems this answer is far more relevant here. (I don't address anything related to parallelizing in my answer, which was the topic of your follow up).
使用Rcpp
和data.table
使用 C++ 编译逻辑并使用 data.table 分组操作按组应用它可以使您的基线速度提高约 2,000 倍,远远超过您希望通过并行化获得的速度。
在你的原始示例中,它有 39,420,000 行,这在我的机器上执行了 1.883 秒;在具有 28,800 行 的修订版上,执行时间为 0.004 秒
library(data.table)
library(Rcpp)
定义并编译一个 C++
函数,CalcSW()
内联在 R 脚本中:
请注意:C
/C++
中的计数从 0
开始,与 R
不同,后者从 1
开始——这就是这里索引不同的原因
Rcpp::cppFunction('
List CalcSW(NumericVector SW_ini,
NumericVector SW_max,
NumericVector rain,
NumericVector swc,
NumericVector PETc) {
int n = SW_ini.length();
NumericVector SW(n);
NumericVector PAW(n);
NumericVector aetc(n);
double SW_ini_glob = SW_ini[0];
double SW_max_glob = SW_max[0];
SW[0] = SW_ini_glob;
PAW[0] = SW[0] + rain[0];
if (PAW[0] > swc[0]){
aetc[0] = PETc[0];
} else {
aetc[0] = PAW[0]/swc[0]*PETc[0];
}
if (aetc[0] > PAW[0]){
aetc[0] = PAW[0];
}
SW[0] = SW[0] + rain[0] - aetc[0];
if(SW[0] > SW_max_glob){
SW[0] = SW_max_glob;
}
if(SW[0] < 0){
SW[0] = 0;
}
for (int i = 1; i < n; i++) {
PAW[i] = SW[i-1] + rain[0];
if (PAW[i] > swc[i]){
aetc[i] = PETc[i];
} else {
aetc[i] = PAW[i]/swc[i]*PETc[i];
}
if (aetc[i] > PAW[i]){
aetc[i] = PAW[i];
}
SW[i] = SW[i-1] + rain[i] - aetc[i];
if(SW[i] > SW_max_glob){
SW[i] = SW_max_glob;
}
if(SW[i] < 0){
SW[i] = 0;
}
}
return Rcpp::List::create(Rcpp::Named("SW") = SW,
Rcpp::Named("PAW") = PAW,
Rcpp::Named("aetc") = aetc);
}')
创建数据表
df <- data.table(loc.id = rep(1:10, each = 80*36),
year = rep(rep(1980:2015, each = 80), times = 10),
day = rep(rep(1:80, times = 36),times = 10),
rain = runif(10*36*80, min = 0 , max = 5),
swc = runif(10*36*80,min = 0, max = 50),
SW_max = rep(runif(10, min = 100, max = 200), each = 80*36),
SW_ini = runif(10*36*80),
PETc = runif(10*36*80, min = 0 , max = 1.3),
SW = as.numeric(NA),
PAW = as.numeric(NA),
aetc = as.numeric(NA))
setkey(df, loc.id, year, day)
对 loc.id
和 year
的每个组合在 df
上执行函数 CalcSW()
,将返回值同时分配给三列:
system.time({
df[, c("SW","PAW","aetc") := CalcSW(SW_ini,
SW_max,
rain,
swc,
PETc), keyby = .(loc.id, year)]
})
...
user system elapsed
0.004 0.000 0.004
结果:
head(df)
...
loc.id year day rain swc SW_max SW_ini PETc SW PAW aetc
1: 1 1980 1 0.35813251 28.360715 177.3943 0.69116310 0.2870478 1.038675 1.049296 0.01062025
2: 1 1980 2 1.10331116 37.013022 177.3943 0.02742273 0.4412420 2.125335 1.396808 0.01665171
3: 1 1980 3 1.76680011 32.509970 177.3943 0.66273062 1.1071233 3.807561 2.483467 0.08457420
4: 1 1980 4 3.20966558 8.252797 177.3943 0.12220454 0.3496968 6.840713 4.165693 0.17651342
5: 1 1980 5 1.32498191 14.784203 177.3943 0.66381497 1.2168838 7.573160 7.198845 0.59253503
6: 1 1980 6 0.02547458 47.903637 177.3943 0.21871598 1.0864713 7.418750 7.931292 0.17988449
我不是 100% 肯定我完美地实现了你的逻辑,但逻辑应该非常简单,可以调整我可能遗漏的地方,我以与你布局的方式非常相似的方式实现它。
另一个注意事项:如果您创建一个单独的文件,名称类似于 TestCode.cpp
,格式如下。
然后,您可以使用 Rcpp::sourceCpp("TestCode.cpp")
在您的 R 脚本中编译您的函数,或者您可以复制并粘贴除前三行以外的所有内容一个字符串作为 Rcpp::cppFunction()
的参数,就像我上面做的那样。
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
List CalcSW(NumericVector SW_ini,
NumericVector SW_max,
NumericVector rain,
NumericVector swc,
NumericVector PETc) {
int n = SW_ini.length();
NumericVector SW(n);
NumericVector PAW(n);
NumericVector aetc(n);
double SW_ini_glob = SW_ini[0];
double SW_max_glob = SW_max[0];
SW[0] = SW_ini_glob;
PAW[0] = SW[0] + rain[0];
if (PAW[0] > swc[0]){
aetc[0] = PETc[0];
} else {
aetc[0] = PAW[0]/swc[0]*PETc[0];
}
if (aetc[0] > PAW[0]){
aetc[0] = PAW[0];
}
SW[0] = SW[0] + rain[0] - aetc[0];
if(SW[0] > SW_max_glob){
SW[0] = SW_max_glob;
}
if(SW[0] < 0){
SW[0] = 0;
}
for (int i = 1; i < n; i++) {
PAW[i] = SW[i-1] + rain[0];
if (PAW[i] > swc[i]){
aetc[i] = PETc[i];
} else {
aetc[i] = PAW[i]/swc[i]*PETc[i];
}
if (aetc[i] > PAW[i]){
aetc[i] = PAW[i];
}
SW[i] = SW[i-1] + rain[i] - aetc[i];
if(SW[i] > SW_max_glob){
SW[i] = SW_max_glob;
}
if(SW[i] < 0){
SW[i] = 0;
}
}
return Rcpp::List::create(Rcpp::Named("SW") = SW,
Rcpp::Named("PAW") = PAW,
Rcpp::Named("aetc") = aetc);
}
关于R:设置初始条件的for循环的dplyr解决方案,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/49118364/