As an exercise, I wrote Peter Norvig's spelling corrector algorithm in Haskell:
module Spl (nwords, correct)
where
import Data.Char (toLower)
import Data.Ord (comparing)
import Data.List (maximumBy, splitAt, foldl')
import Text.Regex.TDFA (getAllTextMatches, (=~))
import qualified Data.Set as Set
import qualified Data.Map.Strict as Map
type NWords = Map.Map String Int
alphabet :: String
alphabet = enumFromTo 'a' 'z'
nwords :: String -> Map.Map String Int
nwords = train . words'
uniqueElems :: Ord a => [a] -> [a]
uniqueElems = uniq' Set.empty
  where uniq' _ [] = []
        uniq' seen (x:xs)
          | x `Set.member` seen = uniq' seen xs
          | otherwise           = x : uniq' (x `Set.insert` seen) xs
words' :: String -> [String]
words' = getAllTextMatches . flip (=~) "[a-z]+" . map toLower
train :: [String] -> NWords
train = foldl' populate Map.empty
  where populate m feature = Map.insertWith (+) feature 1 m
edits :: String -> [String]
edits word = uniqueElems $ concat [dels, trans, repl, ins]
  where dels   = [a ++ tail b | (a,b) <- splits, nn b]
        trans  = [ a ++ (b!!1) : head b : tail (tail b) | (a,b) <- splits
                 , length b > 1 ]
        repl   = [a ++ c : tail b | (a,b) <- splits, c <- alphabet, nn b]
        ins    = [a ++ c : b | (a,b) <- splits, c <- alphabet]
        splits = [splitAt n word | n <- [0..length word]]
        nn     = not . null
knownEdits :: NWords -> String -> [String]
knownEdits nw word = uniqueElems [ e2 | e1 <- edits word, e2 <- edits e1
                                 , Map.member e2 nw ]
known :: NWords -> [String] -> [String]
known nw = uniqueElems . filter (`Map.member` nw)
correct :: NWords -> String -> String
correct nw word = fst $ maximumBy (comparing snd) candidates
  where candidates = [(w, Map.findWithDefault 0 w nw) | w <- result]
        result     = head $ filter (not . null) start
        start      = [ known nw [word], known nw $ edits word
                     , knownEdits nw word, [word] ]
Usage
This is how I use it:
ghci> t <- readFile "big.txt"
ghci> let nw = nwords t
ghci> correct nw "speling"
"spelling"
The big.txt file is available from Peter Norvig's website (direct link, 6.2 MB):
http://norvig.com/big.txt
The problem
- Building the word map takes very long. The train function is much slower than words', so it is the bottleneck.
- Memory usage is crazy. After playing around for a while, it approached 1 GB.

So, where did I go wrong? Do I have a memory leak?
Best answer
My main recommendations are:
- Use an efficient string type (i.e. Text/ByteString, or their lazy variants)
- Use a better hash map implementation - e.g. Data.HashMap.Strict
- Write a custom word parser instead of using regexes
The following code loads all of big.txt into a Data.HashMap.Strict in about 2 seconds, with memory usage of roughly 25 MB (on a 64-bit system):
import qualified Data.HashMap.Strict as H
import qualified Data.Text as T
import qualified Data.Text.IO as T
import Data.List

isAlpha :: Char -> Bool
isAlpha ch = ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')

-- Split text into lowercase alphabetic words.
wrds :: T.Text -> [T.Text]
wrds bs =
  let (_, r1) = T.span (not . isAlpha) bs
      (w, r2) = T.span isAlpha r1
  in if T.null w then [] else T.toLower w : wrds r2

-- Returns membership-test and frequency-lookup functions over big.txt.
readDict = do
  allwords <- fmap wrds $ T.readFile "big.txt"
  let h = foldl' add H.empty allwords
      add h w = let c = H.lookupDefault (0 :: Int) w h
                in H.insert w (c+1) h
      member = \k -> H.member k h
      frequency = \k -> H.lookupDefault 0 k h
  return (member, frequency)
It may be even more efficient to use lazy Text here - something to investigate.
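For example, a rough, untested sketch of such a lazy variant might look like the following (the name readDictLazy is made up here; each word is converted back to strict Text before insertion, so the map does not retain the lazily read chunks):

import qualified Data.HashMap.Strict as H
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.IO as TLIO
import Data.List (foldl')

-- Same ASCII-only test as above.
isAlpha' :: Char -> Bool
isAlpha' ch = ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')

-- Read big.txt lazily, split it on non-alphabetic characters, and build
-- the frequency map; TL.toStrict copies each word out of the lazy buffer.
readDictLazy :: IO (H.HashMap T.Text Int)
readDictLazy = do
  txt <- TLIO.readFile "big.txt"
  let ws = [ T.toLower (TL.toStrict w)
           | w <- TL.split (not . isAlpha') txt
           , not (TL.null w) ]
  return $! foldl' (\h w -> H.insertWith (+) w 1 h) H.empty ws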
Here is the rest of my implementation - it mostly follows Norvig, but I made a few other choices you might find interesting (e.g. nubSort to deduplicate the candidate lists, and orElse so that, thanks to laziness, the distance-2 edits are only generated when no closer known candidate exists):
{-# LANGUAGE OverloadedStrings #-}
module SpellText
where
import qualified Data.Text as T
import Data.Text (Text)
import Data.Monoid
import Data.List.Ordered (nubSort)
import Data.Ord
import Data.List
import Control.Monad
type Dict = ( Text -> Bool, Text -> Int )
singles :: [ Text ]
singles = map T.singleton ['a'..'z']
edits :: Text -> [Text]
edits w = deletes <> nubSort (transposes <> replaces) <> inserts
  where
    splits     = zip (T.inits w) (T.tails w)
    deletes    = [ a <> T.drop 1 b | (a,b) <- splits, T.length b > 0 ]
    transposes = [ a <> c <> T.drop 2 b | (a,b) <- splits, T.length b > 1
                 , let c = T.pack [T.index b 1, T.index b 0] ]
    replaces   = [ a <> c <> T.drop 1 b | (a,b) <- splits, T.length b > 0
                 , c <- singles ]
    inserts    = [ a <> c <> b | (a,b) <- splits, c <- singles ]
orElse :: [a] -> [a] -> [a]
orElse [] bs = bs
orElse as _ = as
-- | Correct a word. 'isMember' and 'frequency' are functions to
-- determine if a word is in the dictionary and to lookup its
-- frequency, respectively.
correct :: Dict -> Text -> Text
correct (isMember, frequency) w0 =
  let ed0 = [w0]
      ed1 = edits w0
      ed2 = [ e2 | e1 <- ed1, e2 <- edits e1 ]
      kn0 = filter isMember ed0
      kn1 = filter isMember ed1
      kn2 = filter isMember ed2
      candidates = kn0 `orElse` (kn1 `orElse` (kn2 `orElse` [w0]))
  in maximumBy (comparing frequency) candidates
It is used like this:
{-# LANGUAGE OverloadedStrings #-}
import ... -- import the above code
main = do
  dictfns <- readDict
  print $ correct dictfns "howwa"
The correction time I measured is comparable to the Python version - perhaps about 10% faster.
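If you want to reproduce the timing, a small harness using the criterion library along these lines should do (a sketch, assuming readDict and correct from the code above are in scope in the same file):

{-# LANGUAGE OverloadedStrings #-}
import Criterion.Main

main :: IO ()
main = do
  dictfns <- readDict    -- build the dictionary once, outside the timed loop
  defaultMain
    [ bench "correct howwa" $ nf (correct dictfns) "howwa" ]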
The original question and answer, "Haskell version of Peter Norvig's spelling corrector is unbelievably slow", can be found on Stack Overflow: https://stackoverflow.com/questions/32570604/