haskell - 优化 Haskell、管道、attoparsec 和容器中的内存

标签 haskell optimization haskell-pipes

我正在尝试进一步优化我的管道 attoparsec 解析器和存储,但无法降低内存使用率。

给定 account-parser.hs

{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}

import Protolude hiding (for)

import Data.Hashable
import Data.IntMap.Strict (IntMap)
import Data.Vector (Vector)
import Pipes
import Pipes.Parse
import Pipes.Safe (MonadSafe, runSafeT)
import qualified Data.Attoparsec.ByteString.Char8 as AB
import qualified Data.IntMap.Strict as IM
import qualified Data.Vector as Vector
import qualified Pipes.Attoparsec as PA
import qualified Pipes.ByteString as PB
import qualified Pipes.Safe.Prelude as PSP

-- accountid|account-name|contractid|code

data AccountLine = AccountLine {
    _accountId         :: !ByteString,
    _accountName       :: !ByteString,
    _accountContractId :: !ByteString,
    _accountCode       :: !Word32
    } deriving (Show)

type MapCodetoAccountIdIdx = IntMap Int

data Accounts = Accounts {
    _accountIds   :: !(Vector ByteString),
    _cache        :: !(IntMap Int),
    _accountCodes :: !MapCodetoAccountIdIdx
    } deriving (Show)


parseAccountLine :: AB.Parser AccountLine
parseAccountLine = AccountLine <$>
    getSubfield <* delim <*>
    getSubfield <* delim <*>
    getSubfield <* delim <*>
    AB.decimal <* AB.endOfLine
    where getSubfield = AB.takeTill (== '|')
          delim = AB.char '|'

--

aempty :: Accounts
aempty = Accounts Vector.empty IM.empty IM.empty

aappend :: Accounts -> AccountLine -> Accounts
aappend (Accounts ids a2i cps) (AccountLine aid an cid cp) =
    case IM.lookup (hash aid) a2i of
        Nothing -> Accounts
                (Vector.snoc ids (toS aid))
                (IM.insert (hash aid) (length ids) a2i)
                (IM.insert (fromIntegral cp) (length ids) cps)
        Just idx -> Accounts ids a2i (IM.insert (fromIntegral cp) idx cps)

foldAccounts :: (Monad m) => Parser AccountLine m Accounts
foldAccounts = foldAll aappend aempty identity

readByteStringFile :: (MonadSafe m) => FilePath -> Producer' ByteString m ()
readByteStringFile file = PSP.withFile file ReadMode PB.fromHandle

accountLines :: Text -> MonadSafe m => Producer AccountLine m (Either (PA.ParsingError, Producer ByteString m ()) ())
accountLines filename = PA.parsed parseAccountLine (readByteStringFile (toS filename))


main :: IO ()
main = do
    [filename] <- getArgs
    x <- runSafeT $ runEffect $ Pipes.Parse.evalStateT foldAccounts (accountLines (toS filename))

    print $ sizes x

sizes :: Accounts -> (Int, Int, Int)
sizes (Accounts aid xxx acp) = (Vector.length aid, IM.size xxx, IM.size acp)

用 GHC 8.0.2 ( stack ghc -- -O2 -rtsopts -threaded -Wall account-parser.hs ) 编译

我无法降低内存使用率。因此我必须快速查找 IntMaps。该文件大约为 20 MB(效率不高)。大多数数据应该能够容纳 5 MB。
$ ./account-parser /tmp/accounts +RTS -s
(5837,5837,373998)
   1,631,040,680 bytes allocated in the heap
     221,765,464 bytes copied during GC
      41,709,048 bytes maximum residency (13 sample(s))
       2,512,560 bytes maximum slop
              82 MB total memory in use (0 MB lost due to fragmentation)

                                     Tot time (elapsed)  Avg pause  Max pause
  Gen  0      2754 colls,     0 par    0.105s   0.142s     0.0001s    0.0002s
  Gen  1        13 colls,     0 par    0.066s   0.074s     0.0057s    0.0216s

  TASKS: 4 (1 bound, 3 peak workers (3 total), using -N1)

  SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled)

  INIT    time    0.000s  (  0.001s elapsed)
  MUT     time    0.324s  (  0.298s elapsed)
  GC      time    0.171s  (  0.216s elapsed)
  EXIT    time    0.000s  (  0.005s elapsed)
  Total   time    0.495s  (  0.520s elapsed)

  Alloc rate    5,026,660,297 bytes per MUT second

  Productivity  65.5% of total user, 58.4% of total elapsed

gc_alloc_block_sync: 0
whitehole_spin: 0
gen[0].sync: 0
gen[1].sync: 0

和个人资料:

enter image description here

最佳答案

如果我,

  • 删除中间查找缓存
  • 使用 HashMap Text (Set Word32)
  • 打开就地压实+RTS -c

  • 我可以将总内存减少到 34 MB,但是我的查找现在变成了 O(n)。这可能是我要得到的最好的。
    {-# LANGUAGE RankNTypes #-}
    {-# LANGUAGE OverloadedStrings #-}
    {-# LANGUAGE NoImplicitPrelude #-}
    
    import           Protolude hiding (for)
    
    import qualified Data.Attoparsec.ByteString.Char8 as AB
    import           Data.HashMap.Strict (HashMap)
    import qualified Data.HashMap.Strict as HashMap
    import           Data.Set (Set)
    import qualified Data.Set as Set
    import           Pipes
    import qualified Pipes.Attoparsec as PA
    import qualified Pipes.ByteString as PB
    import           Pipes.Parse
    import           Pipes.Safe (MonadSafe, runSafeT)
    import qualified Pipes.Safe.Prelude as PSP
    
    -- accountid|account-name|contractid|code
    
    data AccountLine = AccountLine {
        _accountId         :: !ByteString,
        _accountName       :: !ByteString,
        _accountContractId :: !ByteString,
        _accountCode       :: !Word32
        } deriving (Show)
    
    
    newtype Accounts = Accounts (HashMap Text (Set Word32))
                     deriving (Show)
    
    parseAccountLine :: AB.Parser AccountLine
    parseAccountLine = AccountLine <$>
        getSubfield <* delim <*>
        getSubfield <* delim <*>
        getSubfield <* delim <*>
        AB.decimal <* AB.endOfLine
        where getSubfield = AB.takeTill (== '|')
              delim = AB.char '|'
    
    --
    
    aempty :: Accounts
    aempty = Accounts HashMap.empty
    
    aappend :: Accounts -> AccountLine -> Accounts
    aappend (Accounts cps) (AccountLine aid an cid cp) =
        case HashMap.lookup (toS aid) cps of
            Nothing  -> Accounts (HashMap.insert (toS aid) (Set.singleton cp) cps)
            Just value -> Accounts (HashMap.update (\codes -> Just (Set.insert cp value)) (toS aid) cps)
    
    foldAccounts :: (Monad m) => Parser AccountLine m Accounts
    foldAccounts = foldAll aappend aempty identity
    
    readByteStringFile :: (MonadSafe m) => FilePath -> Producer' ByteString m ()
    readByteStringFile file = PSP.withFile file ReadMode PB.fromHandle
    
    accountLines :: Text -> MonadSafe m => Producer AccountLine m (Either (PA.ParsingError, Producer ByteString m ()) ())
    accountLines filename = PA.parsed parseAccountLine (readByteStringFile (toS filename))
    
    
    main :: IO ()
    main = do
        [filename] <- getArgs
        x <- runSafeT $ runEffect $ Pipes.Parse.evalStateT foldAccounts (accountLines (toS filename))
    
        print $ sizes x
    
        -- print x
        print $ lookupAccountFromCode x 254741
        print $ lookupAccountFromCode x 196939
    
    
    sizes :: Accounts -> Int
    sizes (Accounts acp) = HashMap.size acp
    
    lookupAccountFromCode :: Accounts -> Word32 -> Maybe Text
    lookupAccountFromCode (Accounts accts) cp = do
        let f a k v = bool a (Just k) (Set.member cp v)
        HashMap.foldlWithKey' f Nothing accts
    

    并运行
    $ ./account-parser /tmp/accounts +RTS -s -c
    5837
    Just "1-PCECJ5"
    Just "AANA-76KOUU"
       1,652,177,904 bytes allocated in the heap
          83,767,440 bytes copied during GC
          17,563,800 bytes maximum residency (18 sample(s))
             751,144 bytes maximum slop
                  34 MB total memory in use (0 MB lost due to fragmentation)
    
                                         Tot time (elapsed)  Avg pause  Max pause
      Gen  0      3083 colls,     0 par    0.058s   0.069s     0.0000s    0.0002s
      Gen  1        18 colls,     0 par    0.115s   0.151s     0.0084s    0.0317s
    
      TASKS: 4 (1 bound, 3 peak workers (3 total), using -N1)
    
      SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled)
    
      INIT    time    0.000s  (  0.002s elapsed)
      MUT     time    0.263s  (  0.289s elapsed)
      GC      time    0.173s  (  0.219s elapsed)
      EXIT    time    0.009s  (  0.008s elapsed)
      Total   time    0.445s  (  0.518s elapsed)
    
      Alloc rate    6,286,682,587 bytes per MUT second
    
      Productivity  61.0% of total user, 57.4% of total elapsed
    
    gc_alloc_block_sync: 0
    whitehole_spin: 0
    gen[0].sync: 0
    gen[1].sync: 0
    

    关于haskell - 优化 Haskell、管道、attoparsec 和容器中的内存,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/44660738/

    相关文章:

    c++ - Haskell FFI内存分配性能问题

    使用C将字符串常量转换为数值

    python - 根据索引前缀在列中设置值的最有效方法

    parsing - "Sub-parsers"在管道-attoparsec

    haskell - 管道 : open a file according to content of another

    haskell - 流媒体库中惯用的预取

    haskell - 从 Q monad 中提取值(value)环境

    haskell - 如何在 Haskell 中查找运行时的核心数量

    list - 根据 Haskell 中的索引交换列表列表中的两个元素

    c++ - 打开 g++ 优化会导致段错误 - 我不明白