使用以下基准:
# Builds the test genome: the 7-letter motif "gattaca" repeated 100
# times, i.e. a 700-character string.
def create_genome
  motif = "gattaca"
  motif * 100
end
# Benchmark body: tallies letter frequencies 100_000 times by exploding
# the string into one-character strings, grouping equal characters, and
# mapping each group to a [letter, occurrence_count] pair.
def count_frequency_using_chars(sequence)
  100_000.times do
    sequence.chars.group_by { |ch| ch }.map { |letter, occurrences| [letter, occurrences.count] }
  end
end
# Benchmark body: counts each nucleotide with String#count, 100_000
# times. Note each pass scans the whole string once per letter (four
# full scans), yet this is the fast variant in practice.
def count_frequency_using_count(sequence)
  100_000.times do
    %w[a c g t].map { |letter| sequence.count(letter) }
  end
end
# Build the test data once, then run each counting strategy so both
# show up in the profiler (ruby-prof / perftools.rb) output.
sequence = create_genome
count_frequency_using_chars(sequence)
count_frequency_using_count(sequence)
我发现,在基于 C 的 Ruby 1.8 和 1.9.2 中,使用 String#count(letter)
比使用 Enumerable#group_by 和 Array#count 对字符分组并计数
快大约 50 倍。我对此感到有些惊讶,因为 String#count
方法每次迭代要把字符串完整读取四次,而后者只读取一次。
我尝试在 ruby-prof 和 perftools.rb 下运行代码,它们都只是表明 90% 的时间花在了 String#chars
上,却没有进一步细分这 90% 的时间具体消耗在哪里。
如果我不得不猜测为什么会有差异,我会说创建 7000 万个单字符字符串的成本很高,但我怎么知道呢? (更新:String#chars
不是罪魁祸首 - 请参阅 mainly_execute_a_trivial_block
的基准测试)
编辑:当前使用 1.9.2 补丁级别 180 的基准测试:
require 'pp'
require 'benchmark'
# Test fixture: a 700-character genome (100 copies of "gattaca").
def create_genome
  Array.new(100, "gattaca").join
end
ZILLION = 100_000 # shared iteration count for every benchmark below
# Counts each nucleotide with String#count: four full scans of the
# string per pass, repeated ZILLION times.
def count_frequency_using_count(sequence)
  ZILLION.times do
    %w[a c g t].map { |letter| sequence.count(letter) }
  end
end
# Tallies letters by exploding the string into characters, grouping
# equal ones, then counting each group -- repeated ZILLION times.
def count_frequency_using_chars(sequence)
  ZILLION.times do
    sequence.chars.group_by { |ch| ch }.map { |letter, group| [letter, group.count] }
  end
end
# Builds a letter => count hash by folding over the character list with
# inject; the trailing `counts` keeps the hash as the accumulator.
def count_frequency_using_inject_hash(sequence)
  ZILLION.times do
    sequence.chars.inject(Hash.new(0)) { |counts, char| counts[char] += 1; counts }
  end
end
# Same tally as the inject variant, but each_with_object carries the
# hash for us instead of requiring it to be returned from the block.
def count_frequency_using_each_with_object(sequence)
  ZILLION.times do
    sequence.chars.each_with_object(Hash.new(0)) { |letter, counts| counts[letter] += 1 }
  end
end
# Isolates the cost of chars + group_by, without the counting step.
def just_group_by(sequence)
  ZILLION.times do
    sequence.chars.group_by { |ch| ch }
  end
end
# Isolates String#chars itself: the (Ruby 1.9-era) block form of #chars
# is invoked with an empty block, so no grouping or counting happens.
# NOTE(review): passing a block to String#chars is deprecated in modern
# Ruby (>= 2.6) -- kept verbatim because this exact call is what the
# benchmark is timing.
def just_chars_and_trivial_block(sequence)
  ZILLION.times do
    sequence.chars() {}
  end
end
# Baseline: invokes an empty block sequence.length times per pass --
# measures pure block-dispatch overhead with no string work at all.
def mainly_execute_a_trivial_block(sequence)
  ZILLION.times do
    sequence.length.times() {}
  end
end
# Lowest baseline: a bare until-loop counting up to sequence.length,
# with no block dispatch at all -- the floor for raw loop overhead.
# The manual loop shape is deliberate; do not rewrite it as #times.
def execute_an_empty_loop_instead(sequence)
  ZILLION.times do
    i = 0
    max = sequence.length
    until i == max
      i += 1
    end
  end
end
sequence = create_genome
puts RUBY_VERSION
# Run every variant under Benchmark so the timings are directly
# comparable (reports are unlabeled; order matches the results table).
Benchmark.bm do |benchmark|
  benchmark.report { count_frequency_using_count(sequence) }
  benchmark.report { count_frequency_using_chars(sequence) }
  benchmark.report { count_frequency_using_inject_hash(sequence) }
  benchmark.report { count_frequency_using_each_with_object(sequence) }
  benchmark.report { just_group_by(sequence) }
  benchmark.report { just_chars_and_trivial_block(sequence) }
  benchmark.report { mainly_execute_a_trivial_block(sequence) }
  # BUG FIX: this previously called execute_an_empty_for_loop_instead,
  # which is defined nowhere (the method above is named
  # execute_an_empty_loop_instead), so the last report raised
  # NoMethodError instead of producing the timing shown in the results.
  benchmark.report { execute_an_empty_loop_instead(sequence) }
end
结果:
user system total real
0.510000 0.000000 0.510000 ( 0.508499) # count_frequency_using_count
23.470000 0.030000 23.500000 ( 23.575487) # count_frequency_using_chars
32.770000 0.050000 32.820000 ( 32.874634) # count_frequency_using_inject_hash
31.880000 0.040000 31.920000 ( 31.942437) # count_frequency_using_each_with_object
22.950000 0.030000 22.980000 ( 22.970500) # just_group_by
13.300000 0.020000 13.320000 ( 13.314466) # just_chars_and_trivial_block
5.660000 0.000000 5.660000 ( 5.661516) # mainly_execute_a_trivial_block
1.930000 0.010000 1.940000 ( 1.934861) # execute_an_empty_loop_instead
最佳答案
它与 ruby 内部结构无关。您正在将苹果与橙子进行比较。
在您的第一个示例中,您把 700 个字符的字符串分组了 100000 次并求计数,所以问题出在你的逻辑上,而不是计数本身;在第二种方法中,你只是在计数。
两种方法里真正需要的都只是计数这一步。
把第一个例子改成这样
# Answer's variant: group the characters ONCE, outside the loop; only
# the per-letter counting is repeated 100_000 times.
def count_frequency_using_chars(sequence)
  grouped = sequence.chars.group_by { |ch| ch }
  100_000.times do
    grouped.map { |letter, occurrences| [letter, occurrences.count] }
  end
end
它和你的第二种方法一样快。
编辑
这种方法比 count_frequency_using_count
快 3 倍,检查基准测试
# One group_by up front; the timed loop only re-counts the already
# grouped character buckets.
def count_frequency_using_chars_with_single_group(sequence)
  grouped = sequence.chars.group_by { |ch| ch }
  100_000.times do
    grouped.map { |letter, bucket| [letter, bucket.count] }
  end
end
# Reference implementation: four String#count scans per pass.
def count_frequency_using_count(sequence)
  100_000.times do
    %w[a c g t].map { |letter| sequence.count(letter) }
  end
end
# Time the single-group approach against plain String#count; pp prints
# each method's return value alongside the timing rows.
Benchmark.bm do |x|
  x.report { pp count_frequency_using_chars_with_single_group(sequence) }
  x.report { pp count_frequency_using_count(sequence) }
end
user system total real
0.410000 0.000000 0.410000 ( 0.419100)
1.330000 0.000000 1.330000 ( 1.324431)
Andrew,针对你的评论:
每次测量 100000 个序列的字符组成,而不是测量一个序列 100000 次的字符组成
,您的计数方法仍然比 group_by 方法慢。正如你所说,我只是对大字符串进行了基准测试
seq = "gattaca" * 10_000        # seq length is 70000
arr_seq = Array.new(10) { seq } # 10 seq items (all referencing seq)
并改变了处理多个序列的方法
# Multi-sequence variant: for each sequence, group its characters once,
# then repeat only the per-letter counting 100_000 times.
def count_frequency_using_chars_with_single_group(sequences)
  sequences.each do |sequence|
    grouped = sequence.chars.group_by { |ch| ch }
    100_000.times do
      grouped.map { |letter, bucket| [letter, bucket.count] }
    end
  end
end
# Multi-sequence variant of the plain String#count approach: four full
# scans per pass, 100000 passes per sequence.
# BUG FIX: the parameter was named `sequence` while the body iterated
# over `sequences`, so every call raised NameError (undefined local
# variable `sequences`) before doing any work.
def count_frequency_using_count(sequences)
  sequences.each do |sequence|
    100000.times do
      ["a", "c", "g", "t"].map { |letter| sequence.count(letter) }
    end
  end
end
# Time both multi-sequence strategies on the ten 70_000-character
# inputs; pp prints each return value next to the timing rows.
Benchmark.bm do |x|
  x.report { pp count_frequency_using_chars_with_single_group(arr_seq) }
  x.report { pp count_frequency_using_count(arr_seq) }
end
处理100000次,10个70000长度的序列
user system total real
3.710000 0.040000 3.750000 ( 23.452889) #modified group_by approach
1039.180000 6.920000 1046.100000 (1071.374730) #simple char count approach
对于大量字符串,您的简单字符计数方法比修改后的 group_by 方法慢大约 47 倍。我只对 10 个序列运行了上述基准测试,每个序列的长度为 70000。假设这是 100 或 1000 个序列,简单计数永远不会成为一种选择,对吧?
关于ruby - 为什么在 Ruby 中使用 String#count 比使用 String#chars 计算字母更快?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/6387428/