Re: [問題] data.table取出符合條件的資料
我有興趣知道size較大時,哪個速度比較快
寫了一個小程式測試:
library(data.table)
library(dplyr)
library(fastmatch)
library(Rcpp)
library(microbenchmark)
library(rbenchmark)
# Benchmark several ways of selecting the rows where Sepal.Width == 3.5.
#
# Args:
#   N: number of copies of `iris` stacked row-wise to build the test data
#      (a single number >= 1).
#
# Returns:
#   The data frame produced by rbenchmark::benchmark(), restricted to the
#   columns test / replications / elapsed / relative. Each expression is
#   run 100 times.
perf_test <- function(N){
    stopifnot(is.numeric(N), length(N) == 1L, N >= 1)

    # Stack N copies of iris. rep() on a one-element list builds the whole
    # list at once instead of growing it inside a `for (i in 1:N)` loop.
    m <- do.call(rbind, rep(list(iris), N))

    # Keyed data.table copy: the key is what the m2[J(3.5)] / m2[list(3.5)]
    # lookups below search on.
    m2 <- data.table(m)
    setkey(m2, "Sepal.Width")

    # Numeric-only matrix copy (the first four columns of iris).
    m3 <- as.matrix(m[, 1:4])

    benchmark(replications=100,
        m[m$Sepal.Width == 3.5,],
        subset(m, Sepal.Width == 3.5),
        m2[J(3.5)],
        filter(m, Sepal.Width == 3.5),
        filter(m2, Sepal.Width == 3.5),
        m2[list(3.5)],
        # fmatch() returns, for every element of its first argument, the
        # position of the match inside the table `3.5`: 1 on a match and 0
        # (nomatch) otherwise. Used directly as a row index that 0/1 vector
        # would return row 1 once per match, so it is turned into a logical
        # mask with `> 0L` to select the matching rows themselves.
        m2[fmatch(m2$Sepal.Width, 3.5, nomatch = 0L) > 0L, ],
        m3[m3[,2]==3.5,],
        columns = c("test", "replications", "elapsed", "relative")
    )
}
# Size of the iris data set
object.size(iris)
# 7088 bytes
# 200x the data volume (iris stacked 200 times)
perf_test(200)
                                          test replications elapsed relative
4                filter(m, Sepal.Width == 3.5)          100    0.05      1.0
5               filter(m2, Sepal.Width == 3.5)          100    0.14      2.8
7 m[fmatch(m$Sepal.Width, 3.5, nomatch = 0), ]          100    0.25      5.0
1                    m[m$Sepal.Width == 3.5, ]          100    0.44      8.8
8 m2[fmatch(m2$Sepal.Width, 3.5, nomatch = 0)]          100    0.33      6.6
3                                   m2[J(3.5)]          100    0.17      3.4
6                                m2[list(3.5)]          100    0.14      2.8
9                         m3[m3[, 2] == 3.5, ]          100    0.22      4.4
2                subset(m, Sepal.Width == 3.5)          100    0.55     11.0
# 500x the data volume (iris stacked 500 times)
perf_test(500)
                                     test replications elapsed relative
4                filter(m, Sepal.Width == 3.5)          100    0.15    1.000
5               filter(m2, Sepal.Width == 3.5)          100    0.16    1.067
7 m[fmatch(m$Sepal.Width, 3.5, nomatch = 0), ]          100    0.71    4.733
1                    m[m$Sepal.Width == 3.5, ]          100    1.13    7.533
8 m2[fmatch(m2$Sepal.Width, 3.5, nomatch = 0)]          100    0.75    5.000
3                                   m2[J(3.5)]          100    0.19    1.267
6                                m2[list(3.5)]          100    0.16    1.067
9                         m3[m3[, 2] == 3.5, ]          100    0.50    3.333
2                subset(m, Sepal.Width == 3.5)          100    1.26    8.400
# 1000x the data volume (iris stacked 1000 times)
perf_test(1000)
                                          test replications elapsed relative
4                filter(m, Sepal.Width == 3.5)          100    0.27    1.929
5               filter(m2, Sepal.Width == 3.5)          100    0.21    1.500
7 m[fmatch(m$Sepal.Width, 3.5, nomatch = 0), ]          100    1.09    7.786
1                    m[m$Sepal.Width == 3.5, ]          100    1.92   13.714
8 m2[fmatch(m2$Sepal.Width, 3.5, nomatch = 0)]          100    0.97    6.929
3                                   m2[J(3.5)]          100    0.15    1.071
6                                m2[list(3.5)]          100    0.14    1.000
9                         m3[m3[, 2] == 3.5, ]          100    0.83    5.929
2                subset(m, Sepal.Width == 3.5)          100    2.31   16.500
# 1500x the data volume (iris stacked 1500 times)
perf_test(1500)
                                          test replications elapsed relative
4                filter(m, Sepal.Width == 3.5)          100    0.45     2.25
5               filter(m2, Sepal.Width == 3.5)          100    0.31     1.55
7 m[fmatch(m$Sepal.Width, 3.5, nomatch = 0), ]          100    1.76     8.80
1                    m[m$Sepal.Width == 3.5, ]          100    3.11    15.55
8 m2[fmatch(m2$Sepal.Width, 3.5, nomatch = 0)]          100    1.81     9.05
3                                   m2[J(3.5)]          100    0.20     1.00
6                                m2[list(3.5)]          100    0.21     1.05
9                         m3[m3[, 2] == 3.5, ]          100    2.06    10.30
2                subset(m, Sepal.Width == 3.5)          100    3.60    18.00
# 3000x the data volume (iris stacked 3000 times)
perf_test(3000)
                                          test replications elapsed relative
4                filter(m, Sepal.Width == 3.5)          100    0.82     4.10
5               filter(m2, Sepal.Width == 3.5)          100    0.50     2.50
7 m[fmatch(m$Sepal.Width, 3.5, nomatch = 0), ]          100    3.47    17.35
1                    m[m$Sepal.Width == 3.5, ]          100    7.13    35.65
8 m2[fmatch(m2$Sepal.Width, 3.5, nomatch = 0)]          100    3.79    18.95
3                                   m2[J(3.5)]          100    0.20     1.00
6                                m2[list(3.5)]          100    0.22     1.10
9                         m3[m3[, 2] == 3.5, ]          100    2.93    14.65
2                subset(m, Sepal.Width == 3.5)          100    7.39    36.95
# 5000x the data volume (iris stacked 5000 times)
perf_test(5000)
                                          test replications elapsed relative
4                filter(m, Sepal.Width == 3.5)          100    1.46    5.214
5               filter(m2, Sepal.Width == 3.5)          100    0.84    3.000
7 m[fmatch(m$Sepal.Width, 3.5, nomatch = 0), ]          100    6.46   23.071
1                    m[m$Sepal.Width == 3.5, ]          100   10.71   38.250
8 m2[fmatch(m2$Sepal.Width, 3.5, nomatch = 0)]          100    7.37   26.321
3                                   m2[J(3.5)]          100    0.28    1.000
6                                m2[list(3.5)]          100    0.34    1.214
9                         m3[m3[, 2] == 3.5, ]          100    4.96   17.714
2                subset(m, Sepal.Width == 3.5)          100   13.67   48.821
總結:
以資料量約 3544000 bytes(約 500 倍的 iris)為分界:低於此量時 filter + data.frame 比較快,
高於此量時則是 m2[J(3.5)] 跟 m2[list(3.5)] 比較快
補上平台:windows 7 64 bit SP1, R 3.0.3, i7-3700K@4.3GHz
--
※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 218.164.186.40
※ 文章網址: http://www.ptt.cc/bbs/R_Language/M.1396729914.A.B34.html
推
04/06 04:49, , 1F
04/06 04:49, 1F
補上MATLAB速度 (MATLAB 2013b)
***** perf_test.m *****
function [] = perf_test(N)
% PERF_TEST  Time logical column selection on N tiled copies of iris_dataset.
%   perf_test(N) places N copies of the array returned by iris_dataset side
%   by side, then times selecting the columns whose second row equals 3.5.
%   The selection is repeated 100 times and the MEAN time per repetition
%   (accumulated time / 100) is printed -- not the total over all
%   repetitions.

% Evaluate iris_dataset once instead of on every size() call and on every
% loop iteration.
data = iris_dataset;
nCols = size(data, 2);

% Preallocate the tiled matrix; 4 rows, as in the original allocation
% (assumes iris_dataset has 4 rows -- the assignment below requires it).
m = zeros(4, N*nCols);
for i = 1:N
    % Kept on a single line: a statement split across lines without '...'
    % is a syntax error in MATLAB.
    m(:, ((i-1)*nCols+1):(nCols*i)) = data;
end

time = 0;
for j = 1:100
    t = tic;
    tmp = m(:,m(2,:)==3.5);   % the operation being timed; result is unused
    time = time + toc(t);
end
fprintf('Elapsed time is %2.6f seconds.\n', time/100)
***********************
>> perf_test(200)
Elapsed time is 0.000116 seconds.
>> perf_test(500)
Elapsed time is 0.000397 seconds.
>> perf_test(1000)
Elapsed time is 0.000944 seconds.
>> perf_test(1500)
Elapsed time is 0.001334 seconds.
>> perf_test(3000)
Elapsed time is 0.002361 seconds.
>> perf_test(5000)
Elapsed time is 0.004385 seconds.
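備註:要錢的果然比較快(攤手)
(注意:MATLAB 這裡印的是單次平均時間 time/100,而 R 的 elapsed 是 100 次重複的總時間,比較時要先把 MATLAB 的數字乘以 100;換算後在 3000 倍以上的資料量是有 setkey 的 data.table 比較快。)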
推
04/06 11:26, , 2F
04/06 11:26, 2F
→
04/06 11:28, , 3F
04/06 11:28, 3F
→
04/06 11:33, , 4F
04/06 11:33, 4F
→
04/06 11:35, , 5F
04/06 11:35, 5F
補上大大所說的matrix type,不過比data.table還慢
沒有比較快,是比較慢,更別說要比matlab了
※ 編輯: celestialgod (218.164.186.40), 04/06/2014 12:40:14
→
04/06 20:44, , 6F
04/06 20:44, 6F
討論串 (同標題文章)
完整討論串 (本文為第 4 之 4 篇):
R_Language 近期熱門文章
PTT數位生活區 即時熱門文章
                            32
                        
                            67