rdf - 将 RDF 三元组加载到 Virtuoso 开源版(Virtuoso Open Source)中

标签 rdf semantic-web triplestore virtuoso n-triples

我正在尝试使用这个转储文件(this dump)创建 LinkedGeoData.org 的本地镜像。

这大约是 61,000,000 个三元组。Virtuoso 应该可以轻松处理更多数据,但每次加载到大约 40,000,000 个三元组后就停止了。我使用的是 Amazon EC2 的双超大(Double Extra Large)实例,它有 30 GB 的 RAM,还有大量的存储空间。我的配置文件有问题吗?我使用的是 Ubuntu Server 12.04,既尝试过通过 apt-get 安装 Virtuoso(版本 6.1.5),也尝试过按照 Jörn Hees 的说明(Jörn Hees' instructions)从 GitHub 上最新的稳定版源码(版本 6.1.6)编译安装。

我还尝试将转储文件分成更小的部分并一个一个地加载它们。在插入了大约 40,000,000 个三元组后,这也会崩溃。

日志文件没有显示任何异常;virtuoso-t 只是停止工作,而不是真正崩溃,并且 top 显示该进程的 CPU 使用率为 0%。我曾让该进程运行了好几天,但在最初的半小时左右之后就再也没有任何进展。

这是我的 virtuoso.ini文件:

[Database]
; Paths of the main database, log, lock and transaction files.
; Striping is 0 here, so the single DatabaseFile is used and the [Striping]
; section further down has no effect (see the "Striping setup" comment there).
; TempStorage names the section that holds the temporary-database settings
; ([TempDatabase] in this file).
DatabaseFile            = /var/lib/virtuoso/db/virtuoso.db
ErrorLogFile            = /var/lib/virtuoso/db/virtuoso.log
LockFile            = /var/lib/virtuoso/db/virtuoso.lck
TransactionFile         = /var/lib/virtuoso/db/virtuoso.trx
xa_persistent_file      = /var/lib/virtuoso/db/virtuoso.pxa
ErrorLogLevel           = 7
FileExtend          = 200
MaxCheckpointRemap      = 625000
Striping            = 0
TempStorage         = TempDatabase


[TempDatabase]
; Temporary-storage counterpart of [Database]; this section is the one named
; by "TempStorage = TempDatabase" in [Database]. Striping is disabled here too.
DatabaseFile            = /var/lib/virtuoso/db/virtuoso-temp.db
TransactionFile         = /var/lib/virtuoso/db/virtuoso-temp.trx
MaxCheckpointRemap      = 2000
Striping            = 0


;
;  Server parameters
;
[Parameters]
ServerPort          = 1111
LiteMode            = 0
DisableUnixSocket       = 1
DisableTcpSocket        = 0
;SSLServerPort          = 2111
;SSLCertificate         = cert.pem
;SSLPrivateKey          = pk.pem
;X509ClientVerify       = 0
;X509ClientVerifyDepth      = 0
;X509ClientVerifyCAFile     = ca.pem
ServerThreads           = 20
CheckpointInterval      = 60
O_DIRECT            = 0
CaseMode            = 2
MaxStaticCursorRows     = 5000
CheckpointAuditTrail        = 0
AllowOSCalls            = 0
SchedulerInterval       = 10
; NOTE(review): /home/ubuntu/lgd is presumably the directory holding the
; LinkedGeoData dump files to be loaded - confirm.
DirsAllowed         = ., /usr/share/virtuoso/vad, /home/ubuntu/lgd
ThreadCleanupInterval       = 0
ThreadThreshold         = 10
ResourcesCleanupInterval    = 0
FreeTextBatchSize       = 100000
SingleCPU           = 0
VADInstallDir           = /usr/share/virtuoso/vad/
; NOTE(review): PrefixResultNames and IndexTreeMaps each appear twice in this
; section. PrefixResultNames has the same value both times, but IndexTreeMaps
; is 256 here and 64 a few lines below. Which occurrence the server honours
; is not visible from this file - keep a single entry per key.
PrefixResultNames               = 0
RdfFreeTextRulesSize        = 100
IndexTreeMaps           = 256
MaxMemPoolSize                  = 200000000
PrefixResultNames               = 0
MacSpotlight                    = 0
IndexTreeMaps                   = 64
;;
;; When running with large data sets, one should configure the Virtuoso
;; process to use between 2/3 to 3/5 of free system memory and to stripe
;; storage on all available disks.
;;
;; Uncomment next two lines if there is 2 GB system memory free
;       NumberOfBuffers          = 170000
;       MaxDirtyBuffers          = 130000
;; Uncomment next two lines if there is 4 GB system memory free
;       NumberOfBuffers          = 340000
;       MaxDirtyBuffers          = 250000
;; Uncomment next two lines if there is 8 GB system memory free
;       NumberOfBuffers          = 680000
;       MaxDirtyBuffers          = 500000
;; Uncomment next two lines if there is 16 GB system memory free
;       NumberOfBuffers          = 1360000
;       MaxDirtyBuffers          = 1000000
;; Uncomment next two lines if there is 32 GB system memory free
; NOTE: the two active lines below were uncommented by deleting only the
; leading ";", which left them indented. Per the accepted answer to this
; question, Virtuoso did not pick up these indented settings and kept running
; with its small default buffer pool until the leading whitespace was removed.
; Keys must start in column 0.
; NOTE(review): the host is described as having 30 GB of RAM in total, while
; this preset is labelled for 32 GB of free memory - confirm that the preset
; fits within the 2/3 to 3/5 guideline stated above.
       NumberOfBuffers          = 2720000
       MaxDirtyBuffers          = 2000000
;; Uncomment next two lines if there is 48 GB system memory free
;       NumberOfBuffers          = 4000000
;       MaxDirtyBuffers          = 3000000
;; Uncomment next two lines if there is 64 GB system memory free
;       NumberOfBuffers          = 5450000
;       MaxDirtyBuffers          = 4000000
;;
;; Note the default settings will take very little memory
;; but will not result in very good performance
;;


[HTTPServer]
; HTTP listener settings; the port below (8890) is the one used by
; DefaultHost in [URIQA].
ServerPort          = 8890
ServerRoot          = /var/lib/virtuoso/vsp
; NOTE(review): ServerThreads is set twice in this section (20 here, 10 a few
; lines below). Which value takes effect depends on the parser - keep one.
ServerThreads           = 20
DavRoot             = DAV
EnabledDavVSP           = 0
HTTPProxyEnabled        = 0
TempASPXDir         = 0
DefaultMailServer       = localhost:25
ServerThreads           = 10
MaxKeepAlives           = 10
KeepAliveTimeout        = 10
MaxCachedProxyConnections   = 10
ProxyConnectionCacheTimeout = 15
HTTPThreadSize          = 280000
HttpPrintWarningsInOutput   = 0
Charset             = UTF-8
;HTTPLogFile                = logs/http.log

[AutoRepair]
; NOTE(review): presumably 0 leaves automatic repair of bad parent links
; switched off - confirm against the Virtuoso documentation.
BadParentLinks          = 0

[Client]
; Client-side SQL defaults; the remaining options are left commented out.
; NOTE(review): a value of 0 for the two *_TIMEOUT keys presumably means
; "no timeout" - confirm against the Virtuoso documentation.
SQL_PREFETCH_ROWS       = 100
SQL_PREFETCH_BYTES      = 16000
SQL_QUERY_TIMEOUT       = 0
SQL_TXN_TIMEOUT         = 0
;SQL_NO_CHAR_C_ESCAPE       = 1
;SQL_UTF8_EXECS         = 0
;SQL_NO_SYSTEM_TABLES       = 0
;SQL_BINARY_TIMESTAMP       = 1
;SQL_ENCRYPTION_ON_PASSWORD = -1

[VDB]
; NOTE(review): presumably settings for Virtuoso's virtual database layer
; (attached remote data sources) - confirm against the Virtuoso documentation.
ArrayOptimization       = 0
NumArrayParameters      = 10
VDBDisconnectTimeout        = 1000
KeepConnectionOnFixedThread = 0

[Replication]
; ServerName carries the same host tag (IP-10-252-61-61) as ServerName in
; [Zero Config].
; NOTE(review): presumably generated from the machine's host name at install
; time - revisit if the instance is re-created under a different address.
ServerName          = db-IP-10-252-61-61
ServerEnable            = 1
QueueMax            = 50000


;
;  Striping setup
;
;  These parameters have only effect when Striping is set to 1 in the
;  [Database] section, in which case the DatabaseFile parameter is ignored.
;
;  With striping, the database is spawned across multiple segments
;  where each segment can have multiple stripes.
;
;  Format of the lines below:
;    Segment<number> = <size>, <stripe file name> [, <stripe file name> .. ]
;
;  <number> must be ordered from 1 up.
;
;  The <size> is the total size of the segment which is equally divided
;  across all stripes forming  the segment. Its specification can be in
;  gigabytes (g), megabytes (m), kilobytes (k) or in database blocks
;  (b, the default)
;
;  Note that the segment size must be a multiple of the database page size
;  which is currently 8k. Also, the segment size must be divisible by the
;  number of stripe files forming  the segment.
;
;  The example below creates a 200 meg database striped on two segments
;  with two stripes of 50 meg and one of 100 meg.
;
;  You can always add more segments to the configuration, but once
;  added, do not change the setup.
;
[Striping]
; Sample segment layout: two segments of 100M each, the first split across
; two stripe files. Inactive in this configuration, because Striping is set
; to 0 in the [Database] section.
Segment1            = 100M, db-seg1-1.db, db-seg1-2.db
Segment2            = 100M, db-seg2-1.db
;...

;[TempStriping]
;Segment1           = 100M, db-seg1-1.db, db-seg1-2.db
;Segment2           = 100M, db-seg2-1.db
;...

;[Ucms]
;UcmPath            = <path>
;Ucm1               = <file>
;Ucm2               = <file>
;...


[Zero Config]
; ServerName embeds the same host tag (IP-10-252-61-61) as ServerName in
; [Replication]; the DSN and SSL entries are left commented out.
ServerName          = virtuoso (IP-10-252-61-61)
;ServerDSN          = ZDSN
;SSLServerName          = 
;SSLServerDSN           = 


[Mono]
; Every key in this section is commented out, so the section is effectively
; empty.
;MONO_TRACE         = Off
;MONO_PATH          = <path_here>
;MONO_ROOT          = <path_here>
;MONO_CFG_DIR           = <path_here>
;virtclr.dll            =


[URIQA]
; DefaultHost uses port 8890, the same port as ServerPort in [HTTPServer];
; keep the two in sync if the HTTP port is changed.
DynamicLocal            = 0
DefaultHost         = localhost:8890


[SPARQL]
; Limits and defaults for the SPARQL endpoint.
; NOTE(review): three value lines below carry a trailing "; ..." remark on
; the same line as the value. Many INI parsers treat everything after "=" as
; the value, so whether these remarks are stripped depends on the parser -
; confirm, or move the remarks onto their own lines.
;ExternalQuerySource        = 1
;ExternalXsltSource         = 1
;DefaultGraph           = http://localhost:8890/dataspace
;ImmutableGraphs            = http://localhost:8890/dataspace
ResultSetMaxRows            = 10000
MaxQueryCostEstimationTime  = 4000  ; in seconds
MaxQueryExecutionTime       = 600   ; in seconds
DefaultQuery                = select distinct ?Concept where {[] a ?Concept} LIMIT 100
DeferInferenceRulesInit     = 0  ; controls inference rules loading
;PingService            = http://rpc.pingthesemanticweb.com/
ShortenLongURIs = 1

[Plugins]
; LoadPath is the directory for the plugins listed below.
; NOTE(review): each LoadN entry looks like "<type>, <plugin name>" with N
; counting up from 1 - confirm against the Virtuoso documentation.
LoadPath            = /usr/lib/virtuoso/hosting
Load1               = plain, wikiv
Load2               = plain, mediawiki
Load3               = plain, creolewiki
Load4           = plain, im

非常感谢任何帮助。

最佳答案

回答我自己的问题:问题出在下面这两行行首的前导空格上。

   NumberOfBuffers          = 2720000
   MaxDirtyBuffers          = 2000000

删除这些后,Virtuoso 实际上使用了可用内存,而不是默认的 16MB。

关于rdf - 将 rdf 三元组加载到 virtuoso 开源中,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/11946022/

相关文章:

svg - SVG协作工作中的RDF元数据

xml - rdf :resource, rdf:about 和 rdf:ID 的区别

java - 如何将 Jena 规则添加到 OntModel

sparql - 在海龟或 RDF 中,我可以在符合条件的所有主题上添加谓词/宾语吗?

rdf - 有没有办法将RDF词汇表的数据格式转换为SKOS

java - 如何在 JENA 中添加限定基数

rdf - 正确使用rdfs :subPropertyOf

python - 使用存储在 python 字典中的属性添加 RDF 三元组

rdf - CONSTRUCT 子句的实际使用(和重用)

aggregate - Sparql group_concat - 停止排序