ruby - 为什么在 Ruby 中使用 String#count 比使用 String#chars 计算字母更快？

使用以下基准:

def create_genome
  "gattaca" * 100
end

def count_frequency_using_chars(sequence)
  100000.times do
    sequence.chars.group_by{|x| x}.map{|letter, array| [letter, array.count]}
  end
end

def count_frequency_using_count(sequence)
  100000.times do
    ["a", "c", "g", "t"].map{|letter| sequence.count(letter)}
  end
end

sequence = create_genome
count_frequency_using_chars(sequence)
count_frequency_using_count(sequence)

我发现，在基于 C 的 Ruby 1.8 和 1.9.2 中，使用 String#count(letter) 比使用 Enumerable# 对它们进行排序和计数快大约 50 倍group_by 和 Array#count。我对此感到有些惊讶，因为 String#count 方法每次迭代读取字符串四次，而后者只读取一次。

我尝试在 ruby-prof 和 perftools.rb 下运行代码，它们都只是表明 String#chars 占用了 90% 的时间，没有分割那 90% 的时间花费了 % 的时间。

如果我不得不猜测为什么会有差异，我会说创建 7000 万个单字符字符串的成本很高，但我怎么知道呢？ (更新:String#chars 不是罪魁祸首 - 请参阅 mainly_execute_a_trivial_block 的基准测试)

编辑:当前使用 1.9.2 补丁级别 180 的基准测试:

require 'pp'
require 'benchmark'

def create_genome
  "gattaca" * 100
end

ZILLION = 100000

def count_frequency_using_count(sequence)
  ZILLION.times do
    ["a", "c", "g", "t"].map{|letter| sequence.count(letter)}
  end
end

def count_frequency_using_chars(sequence)
  ZILLION.times do
    sequence.chars.group_by{|x| x}.map{|letter, array| [letter, array.count]}
  end
end

def count_frequency_using_inject_hash(sequence)
  ZILLION.times do
     sequence.chars.inject(Hash.new(0)) { |h, e| h[e] += 1 ; h }
  end
end

def count_frequency_using_each_with_object(sequence)
  ZILLION.times do
     sequence.chars.each_with_object(Hash.new(0)) { |char, hash| hash[char] += 1}
  end
end


def just_group_by(sequence)
  ZILLION.times do
    sequence.chars.group_by{|x| x}
  end
end

def just_chars_and_trivial_block(sequence)
  ZILLION.times do
    sequence.chars() {}
  end
end

def mainly_execute_a_trivial_block(sequence)
  ZILLION.times do
    sequence.length.times() {}
  end
end

def execute_an_empty_loop_instead(sequence)
  ZILLION.times do
    i = 0
    max = sequence.length
    until i == max
      i += 1
    end
  end
end

sequence = create_genome

puts RUBY_VERSION

Benchmark.bm do |benchmark|
  benchmark.report do
    count_frequency_using_count(sequence)
  end
  benchmark.report do
    count_frequency_using_chars(sequence)
  end
  benchmark.report do
    count_frequency_using_inject_hash(sequence)
  end
  benchmark.report do
    count_frequency_using_each_with_object(sequence)
  end
  benchmark.report do
    just_group_by(sequence)
  end
  benchmark.report do
    just_chars_and_trivial_block(sequence)
  end
  benchmark.report do
    mainly_execute_a_trivial_block(sequence)
  end
  benchmark.report do
    execute_an_empty_for_loop_instead(sequence)
  end
end

结果:

     user     system      total        real
 0.510000   0.000000   0.510000 (  0.508499) # count_frequency_using_count
23.470000   0.030000  23.500000 ( 23.575487) # count_frequency_using_chars
32.770000   0.050000  32.820000 ( 32.874634) # count_frequency_using_inject_hash
31.880000   0.040000  31.920000 ( 31.942437) # count_frequency_using_each_with_object
22.950000   0.030000  22.980000 ( 22.970500) # just_group_by
13.300000   0.020000  13.320000 ( 13.314466) # just_chars_and_trivial_block
 5.660000   0.000000   5.660000 (  5.661516) # mainly_execute_a_trivial_block
 1.930000   0.010000   1.940000 (  1.934861) # execute_an_empty_loop_instead

最佳答案

它与 ruby 内部结构无关。您正在将苹果与橙子进行比较。

在您的第一个示例中，您将 700 个字符字符串分组 100000 次并计算计数。所以这是你逻辑上的问题。不是在计算。在第二种方法中，你只是在计算，

在这两种方法中，您只使用计数

把第一个例子改成这样

def count_frequency_using_chars(sequence)
  grouped = sequence.chars.group_by{|x| x}
  100000.times do
    grouped.map{|letter, array| [letter, array.count]}
  end
end

它和你的秒一样快

编辑

这种方法比 count_frequency_using_count 快 3 倍，检查基准测试

  def count_frequency_using_chars_with_single_group(sequence)
    grouped = sequence.chars.group_by{|x| x}
      100000.times do
        grouped.map{|letter, array| [letter, array.count]}
      end
    end

    def count_frequency_using_count(sequence)
      100000.times do
        ["a", "c", "g", "t"].map{|letter| sequence.count(letter)}
      end
    end

Benchmark.bm do |benchmark|
  benchmark.report do
    pp count_frequency_using_chars_with_single_group(sequence)
  end
  benchmark.report do
    pp count_frequency_using_count(sequence)
  end
end


    user     system      total        real
  0.410000   0.000000   0.410000 (  0.419100)
  1.330000   0.000000   1.330000 (  1.324431)

安德鲁对你的评论，

每次测量 100000 个序列的字符组成，而不是测量一个序列 100000 次的字符组成，您的计数方法仍然比 group_by 方法慢。正如你所说，我只是对大字符串进行了基准测试

seq = "gattaca" * 10000
#seq length is 70000

arr_seq = (1..10).map {|x| seq}
#10 seq items

并改变了处理多个序列的方法

def count_frequency_using_chars_with_single_group(sequences)
  sequences.each do |sequence|
    grouped = sequence.chars.group_by{|x| x}
    100000.times do
      grouped.map{|letter, array| [letter, array.count]}
    end
  end
end

def count_frequency_using_count(sequence)
  sequences.each do |sequence|
    100000.times do
      ["a", "c", "g", "t"].map{|letter| sequence.count(letter)}
    end
  end
end


Benchmark.bm do |benchmark|
  benchmark.report do
    pp count_frequency_using_chars_with_single_group(arr_seq)
  end
  benchmark.report do
    pp count_frequency_using_count(arr_seq)
  end
end

处理100000次，10个70000长度的序列

  user        system      total        real
 3.710000     0.040000    3.750000     ( 23.452889)   #modified group_by approach
 1039.180000  6.920000    1046.100000  (1071.374730) #simple char count approach

对于大量字符串，您的简单字符计数方法比修改后的 group_by 方法慢 47%。我只对 10 个序列运行了上述基准测试，每个序列的长度为 70000。假设这是 100 或 1000 个序列，简单计数永远不会成为一种选择。对吧？

关于ruby - 为什么在 Ruby 中使用 String#count 比使用 String#chars 计算字母更快？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/6387428/

ruby - 为什么在 Ruby 中使用 String#count 比使用 String#chars 计算字母更快？

安德鲁对你的评论，

上一篇：ruby - 液体错误 : wrong number of arguments

下一篇：ruby-on-rails - rails : Create child/parent relationship