我试图使用left_join和nest来计算被绑定数据的平均值。
bin.size = 100 第一次数据访问:
df = data.frame(x =c(300,400),
y = c("sca1","sca2"))
x y
1 300 sca1
2 400 sca2第二次数据访问:
df2 = data.frame(snp = c(1,2,10,100,1,2,14,16,399),
sca = c("sca1","sca1","sca1","sca1","sca2","sca2","sca2","sca2","sca2"))
snp r2 sca
1 1 0.70 sca1
2 2 0.80 sca1
3 10 0.70 sca1
4 100 0.10 sca1
5 1 0.90 sca2
6 2 0.98 sca2
7 14 0.80 sca2
8 16 0.80 sca2
9 399 0.01 sca2@r2evans代码:
output_bin_LD = df %>%
left_join(nest(df2, snp, .key = "snp"), by = c("y" = "sca")) %>%
mutate(
cuts = map(x, ~ seq(0, ., by = bin.size)),
tbls = pmap(
.l = list(snp, cuts),
.f = function(xx, breaks) {
z <- table(cut(xx$snp, breaks))
data_frame(cut = names(z), count = z)
}
)
) %>%
select(y, tbls) %>%
unnest()这个code up就是这样做的:
y cut count
1 sca1 (0,100] 4
2 sca1 (100,200] 0
3 sca1 (200,300] 0
4 sca2 (0,100] 4
5 sca2 (100,200] 0
6 sca2 (200,300] 0
7 sca2 (300,400] 1最终的目标是
y cut count mean
1 sca1 (0,100] 4 0.575
2 sca1 (100,200] 0 0
3 sca1 (200,300] 0 0
4 sca2 (0,100] 4 0.87
5 sca2 (100,200] 0 0
6 sca2 (200,300] 0 0
7 sca2 (300,400] 1 399到目前为止,我已经尝试过了:
df %>%
left_join(nest(df2, snp, r2, .key = "snp"),
by = c("y" = "sca")) %>%
mutate(
cuts = map(x, ~ seq(0, ., by = 100)),
tbls = pmap(
.l = list(snp, cuts),
.f = function(xx, breaks) {
z <- table(cut(xx$snp, breaks))
a <- mean(cut(xx$r2, breaks))
data_frame(cut = names(z), count = z, mean = a)
} # .f
) # closing pmap
) %>% # mutate
select(y, tbls) %>%
unnest()但是它输出了NA和一个警告信息:
y cut count mean
1 sca1 (0,100] 4 NA
2 sca1 (100,200] 0 NA
3 sca1 (200,300] 0 NA
4 sca2 (0,100] 4 NA
5 sca2 (100,200] 0 NA
6 sca2 (200,300] 0 NA
7 sca2 (300,400] 1 NA
Warning messages:
1: In mean.default(cut(xx$r2, breaks)) :
argument is not numeric or logical: returning NA
2: In mean.default(cut(xx$r2, breaks)) :
argument is not numeric or logical: returning NA我该怎么解决这个问题?我需要双倍嵌套桌子吗?
发布于 2017-01-19 22:51:49
不确定你的方法,但这里有一个简单明了的方法。如果您感兴趣的话,可以使用data.table包。您需要最新版本(当前为1.10.0)才能工作(因为它是一个新特性)。
require(data.table) ## v1.9.8+
and <- b[a, on=.(sca=y, snp>start, snp<=end), ## 1
.(count=.N, mean=mean(r2, na.rm=TRUE)), ## 2
by=.EACHI] ## 3a中的每一行,在b中查找匹配行索引,同时根据提供给on参数的条件进行匹配。length(matching row indices) == .N给出了count,mean()给出了匹配指数的r2均值。(2)中的表达式针对a中的每一行运行。其中,a是:
require(data.table) ## v1.9.8+
a <- setDT(df)[, .(start=seq(0, x-1, by=bin.size),
end=seq(bin.size, x, by=bin.size)),
by=y]
b <- fread("snp r2 sca
1 0.70 sca1
2 0.80 sca1
10 0.70 sca1
100 0.10 sca1
1 0.90 sca2
2 0.98 sca2
14 0.80 sca2
16 0.80 sca2
399 0.01 sca2")发布于 2017-01-20 00:19:36
这里有一个tidyverse选项,它在dplyr上比purrr更重,这使它更具可读性:
library(tidyverse)
df2 %>% group_by(sca, cut = cut(snp, seq(0, max(df$x), bin.size))) %>%
summarise(count = n(), # For each group, count rows
mean = mean(r2)) %>% # and calculate mean
# Add rows for every level of the cuts. Fill new rows with zeros.
complete(cut, fill = list(count = 0L, mean = 0)) %>%
separate(cut, c('from', 'to'), sep = ',') %>% # Split cut into two numbers
mutate_at(vars(from:to), parse_number) %>% # Extract numbers from strings
left_join(df, c(sca = 'y')) %>% # Join to get x value for each group
filter(to <= x) # Subset to rows where the max cut is within the range.
#> Source: local data frame [7 x 6]
#> Groups: sca [2]
#>
#> sca from to count mean x
#> <chr> <dbl> <dbl> <int> <dbl> <dbl>
#> 1 sca1 0 100 4 0.575 300
#> 2 sca1 100 200 0 0.000 300
#> 3 sca1 200 300 0 0.000 300
#> 4 sca2 0 100 4 0.870 400
#> 5 sca2 100 200 0 0.000 400
#> 6 sca2 200 300 0 0.000 400
#> 7 sca2 300 400 1 0.010 400实际上,您可以使用一些正则表达式和子设置来避免连接和干扰裁剪:
df2 %>% group_by(sca, cut = cut(snp, seq(0, max(df$x), bin.size))) %>%
summarise(count = n(),
mean = mean(r2)) %>%
complete(cut, fill = list(count = 0L, mean = 0)) %>%
filter(as.integer(gsub('.*,(\\d+).*', '\\1', cut)) <= df$x[unique(sca) == df$y])
#> Source: local data frame [7 x 4]
#> Groups: sca [2]
#>
#> sca cut count mean
#> <chr> <fctr> <int> <dbl>
#> 1 sca1 (0,100] 4 0.575
#> 2 sca1 (100,200] 0 0.000
#> 3 sca1 (200,300] 0 0.000
#> 4 sca2 (0,100] 4 0.870
#> 5 sca2 (100,200] 0 0.000
#> 6 sca2 (200,300] 0 0.000
#> 7 sca2 (300,400] 1 0.010数据
df <- structure(list(x = c(300, 400), y = c("sca1", "sca2")), .Names = c("x",
"y"), row.names = c(NA, -2L), class = "data.frame")
df2 <- structure(list(snp = c(1L, 2L, 10L, 100L, 1L, 2L, 14L, 16L, 399L
), r2 = c(0.7, 0.8, 0.7, 0.1, 0.9, 0.98, 0.8, 0.8, 0.01), sca = c("sca1",
"sca1", "sca1", "sca1", "sca2", "sca2", "sca2", "sca2", "sca2"
)), .Names = c("snp", "r2", "sca"), row.names = c(NA, -9L), class = "data.frame")https://stackoverflow.com/questions/41752268
复制相似问题