如何以合理有效的方式获取 .NET 4 中 XElement 的流位置?
          1         2         3         4         5         6         7         8
01234567890123456789012345678901234567890123456789012345678901234567890123456789012
<root><group id="0" combiner="or"><filter id="1" /><filter id="2" /></group></root>
我想根据上面的 XML 创建一个从元素路径到段(Segment)的映射:
{ { "/root", Segment(0 , 82) },
{ "/root/group-0", Segment(6 , 75) },
{ "/root/group-0/filter-1", Segment(34, 50) },
{ "/root/group-0/filter-2", Segment(51, 67) } }
注意事项
- 段的第二个字段可以是长度而不是结束索引
- 方法可以更通用/扩展到其他字节表示
关于我的答案的博文和内存分析屏幕截图
http://corsis.posterous.com/xml-keyvalue-cache-optimizations
加分项
- 使用一种紧凑的存储形式:既能以 O(1) 的代价访问任意元素,又只需在内存中保留整个文档的一份副本,任何子元素都不会被重复存储。
奖励示例
store["/root"].Decompress() **O(1)**
store["/root/group-0"].Decompress() **O(1)**
最佳答案
这是我最初的尝试:
using System;
using System.Collections.Generic;
using System.Collections.Concurrent;
using System.Linq;
using System.IO;
using System.Xml;
using System.Xml.Linq;
using System.Text;
namespace XMLTest
{
/// <summary>
/// A span described by a start offset and a length.
/// </summary>
public struct Segment
{
    /// <summary>Start offset of the span.</summary>
    public long Index;

    /// <summary>Length of the span.</summary>
    public long Length;

    /// <summary>Creates a segment from its start offset and its length.</summary>
    /// <param name="index">Value stored in <see cref="Index"/>.</param>
    /// <param name="length">Value stored in <see cref="Length"/>.</param>
    public Segment(long index, long length)
    {
        this.Index = index;
        this.Length = length;
    }

    /// <summary>Renders the segment as <c>Segment(index, length)</c>.</summary>
    public override string ToString()
    {
        object[] parts = { this.Index, this.Length };
        return String.Format("Segment({0}, {1})", parts);
    }
}
/// <summary>
/// Extension helpers: slicing a string by a Segment and copying the contents of a stream.
/// </summary>
public static class GeneralSerializationExtensions
{
    /// <summary>
    /// Returns the part of <paramref name="buffer"/> described by <paramref name="segment"/>.
    /// </summary>
    /// <param name="buffer">Text to slice.</param>
    /// <param name="segment">Start offset and length, both narrowed to <c>int</c> for Substring.</param>
    /// <returns>The substring starting at segment.Index with segment.Length characters.</returns>
    public static string Segment(this string buffer, Segment segment)
    {
        return buffer.Substring((int)segment.Index, (int)segment.Length);
    }

    /// <summary>
    /// Copies the bytes of <paramref name="stream"/> from <paramref name="startIndex"/> to its end.
    /// </summary>
    /// <param name="stream">Stream to read; it must support both seeking and reading.</param>
    /// <param name="startIndex">Offset from the beginning of the stream at which copying starts.</param>
    /// <param name="setBack">When true, the stream position is restored to its value before the call.</param>
    /// <returns>
    /// The bytes from <paramref name="startIndex"/> to the end of the stream, or an empty array
    /// when the stream cannot seek or cannot be read.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
    public static byte[] Bytes(this Stream stream, int startIndex = 0, bool setBack = false)
    {
        if (stream == null)
            throw new ArgumentNullException("stream");

        // Test the capabilities before touching Length or Position: both throw on a
        // stream that does not support seeking.
        if (!stream.CanSeek || !stream.CanRead)
            return new byte[0];

        var position = stream.Position;
        stream.Seek(startIndex, SeekOrigin.Begin);

        // Only the bytes after startIndex can be read, so size the array to exactly
        // that amount instead of padding it with trailing zeros.
        var remaining = Math.Max(0L, stream.Length - startIndex);
        var bytes = new byte[remaining];

        // A single Read call may return fewer bytes than requested; keep reading
        // until the array is full or the stream reports end of data.
        var filled = 0;
        while (filled < bytes.Length)
        {
            var read = stream.Read(bytes, filled, bytes.Length - filled);
            if (read == 0)
                break;
            filled += read;
        }

        if (setBack)
            stream.Position = position;
        return bytes;
    }
}
/// <summary>
/// Console demo: indexes a small XML document by element path and prints, for every path,
/// its segment and the markup sliced out of the serialized text.
/// </summary>
class Program
{
    /// <summary>Entry point; <paramref name="args"/> is not used.</summary>
    static void Main(string[] args)
    {
        var element = XElement.Parse(@"<root><group id=""0"" combiner=""or""><filter id=""1"" /><filter id=""2"" /></group></root>");
        // Alternative input with text content and more attributes:
        //var element = XElement.Parse("<a>i<b id='1' o='2' p=''/><b id='2'><c /></b><b id='3' /><b id='4' o='u'>2</b></a>");
        var pie = new PathIndexedXElement(element);
        foreach (var path in pie.Paths.OrderBy(p => p))
        {
            var s = pie.store[path];   // where the element lives in the buffer
            var t = pie[path];         // the element's markup itself
            Console.WriteLine("> {2,-30} {0,-20} {1}", s, t, path);
        }
    }
}
/// <summary>
/// Holds the serialized text of an XElement together with a path-to-segment index, so the
/// markup of an indexed element can be obtained by slicing the text.
/// </summary>
public class PathIndexedXElement
{
    // Serialized markup, as returned by XmlPathSegmenter.StringBuffer.
    internal string buffer;

    // Element path -> segment within the buffer, as returned by XmlPathSegmenter.PathSegments.
    internal ConcurrentDictionary<string, Segment> store;

    /// <summary>Serializes <paramref name="element"/> and builds its path index.</summary>
    public PathIndexedXElement(XElement element)
    {
        this.buffer = XmlPathSegmenter.StringBuffer(element);
        this.store = XmlPathSegmenter.PathSegments(element);
    }

    /// <summary>All indexed element paths.</summary>
    public IEnumerable<string> Paths
    {
        get
        {
            ICollection<string> keys = this.store.Keys;
            return keys;
        }
    }

    /// <summary>
    /// Returns the markup stored under <paramref name="path"/>; an unknown path makes the
    /// underlying dictionary lookup throw.
    /// </summary>
    public string this[string path]
    {
        get
        {
            Segment located = this.store[path];
            return GeneralSerializationExtensions.Segment(this.buffer, located);
        }
    }

    /// <summary>
    /// Looks up <paramref name="path"/> without throwing.
    /// </summary>
    /// <param name="path">Element path to look up.</param>
    /// <param name="xelement">The element's markup when found; otherwise null.</param>
    /// <returns>True when the path is indexed.</returns>
    public bool TryGetValue(string path, out string xelement)
    {
        Segment located;
        bool found = this.store.TryGetValue(path, out located);
        xelement = found ? GeneralSerializationExtensions.Segment(this.buffer, located) : null;
        return found;
    }
}
/// <summary>
/// Serializes an XElement without indentation and records, for every element, the offset and
/// length of its markup inside that serialized text, keyed by a path such as
/// <c>/root/group-0/filter-1</c>.
/// </summary>
public static class XmlPathSegmenter
{
    /// <summary>
    /// Creates a writer over <paramref name="stream"/> that emits UTF-8 without indentation,
    /// without an XML declaration and without newline normalization.
    /// </summary>
    public static XmlWriter CreateWriter(Stream stream)
    {
        var settings = new XmlWriterSettings() { Encoding = Encoding.UTF8, Indent = false, OmitXmlDeclaration = true, NewLineHandling = NewLineHandling.None };
        return XmlWriter.Create(stream, settings);
    }

    /// <summary>
    /// Serializes <paramref name="element"/> into a new memory stream positioned at its start.
    /// The caller owns the returned stream.
    /// </summary>
    public static MemoryStream MemoryBuffer(XElement element)
    {
        var stream = new MemoryStream();
        // CloseOutput is left at its default (false), so disposing the writer flushes it
        // but keeps the stream open for the caller.
        using (var writer = CreateWriter(stream))
        {
            element.Save(writer);
        }
        stream.Position = 0;
        return stream;
    }

    /// <summary>
    /// Returns the serialized markup of <paramref name="element"/> as a string.
    /// </summary>
    public static string StringBuffer(XElement element)
    {
        string text;
        using (var stream = MemoryBuffer(element))
        {
            text = Encoding.UTF8.GetString(stream.ToArray());
        }
        // Writing with Encoding.UTF8 puts a byte-order mark in front of the markup; it decodes
        // to a single U+FEFF character. Drop it only when it is really there so that the
        // recorded offsets line up with the first '<'.
        return text.Length > 0 && text[0] == '\uFEFF' ? text.Substring(1) : text;
    }

    /// <summary>
    /// Parses <paramref name="xmlElement"/> and indexes the resulting element.
    /// </summary>
    /// <param name="xmlElement">XML markup of a single element.</param>
    /// <param name="store">Optional dictionary to fill; a new one is created when null.</param>
    public static ConcurrentDictionary<string, Segment> PathSegments(string xmlElement, ConcurrentDictionary<string, Segment> store = null)
    {
        return PathSegments(XElement.Parse(xmlElement), store);
    }

    /// <summary>
    /// Serializes <paramref name="element"/> with <see cref="MemoryBuffer"/> and indexes the result.
    /// </summary>
    /// <param name="element">Element to index.</param>
    /// <param name="store">Optional dictionary to fill; a new one is created when null.</param>
    public static ConcurrentDictionary<string, Segment> PathSegments(this XElement element, ConcurrentDictionary<string, Segment> store = null)
    {
        // Reuse MemoryBuffer instead of repeating the serialization steps here.
        using (var stream = MemoryBuffer(element))
        {
            return PathSegments(stream, store);
        }
    }

    /// <summary>
    /// Indexes the XML read from <paramref name="stream"/>.
    /// </summary>
    /// <param name="stream">Stream containing the XML markup.</param>
    /// <param name="store">Optional dictionary to fill; a new one is created when null.</param>
    /// <returns>The dictionary mapping element paths to segments.</returns>
    public static ConcurrentDictionary<string, Segment> PathSegments(Stream stream, ConcurrentDictionary<string, Segment> store = null)
    {
        if (store == null)
            store = new ConcurrentDictionary<string, Segment>();
        var stack = new ConcurrentStack<KeyValuePair<string, int>>();
        PathSegments(stream, stack, store);
        return store;
    }

    //
    // Walks the document node by node. Each open element is kept on the stack as
    // (path, start offset); when the element ends, its segment is added to the store.
    // Offsets are derived from IXmlLineInfo.LinePosition, which is relative to the current
    // line, so they are only meaningful for markup written on a single line (which is what
    // CreateWriter produces with Indent = false).
    static void PathSegments(Stream stream, ConcurrentStack<KeyValuePair<string, int>> stack, ConcurrentDictionary<string, Segment> store)
    {
        // CloseInput defaults to false, so disposing the reader leaves the stream open.
        using (var reader = XmlReader.Create(stream, new XmlReaderSettings() { }))
        {
            // Direct cast: fail right here if the reader exposes no line information,
            // rather than with a null reference further down.
            var line = (IXmlLineInfo)reader;
            while (reader.Read())
            {
                KeyValuePair<string, int> ep;
            ok:
                if (reader.IsStartElement())
                {
                    // An empty stack leaves ep at its default, whose null Key concatenates
                    // as an empty parent path.
                    stack.TryPeek(out ep);
                    // LinePosition is 1-based and refers to the element name, so subtracting 2
                    // gives the 0-based offset of the opening '<'.
                    stack.Push(new KeyValuePair<string, int>(ep.Key + Path(reader), line.LinePosition - 2));
                }
                if (reader.IsEmptyElement)
                {
                    var name = reader.LocalName;
                    var d = reader.Depth;
                    // A self-closing element produces no EndElement node, so its end is taken
                    // from the position of the node that follows it.
                    // NOTE(review): the result of Read() is ignored; if the empty element is the
                    // last node, the position used below is whatever the reader reports at end
                    // of input - confirm. A following text node is also not accounted for.
                    reader.Read();
                    if (stack.TryPop(out ep))
                    {
                        // When the next node is the parent's end tag (depth decreased), the
                        // position refers to the name after "</", one character further than
                        // for a start tag, hence the extra 1.
                        var length = line.LinePosition - 2 - ep.Value - (d > reader.Depth ? 1 : 0);
                        Console.WriteLine("/{3}|{0} : {1} -> {2}", name, ep.Value, length, line.LineNumber);
                        store.TryAdd(ep.Key, new Segment(ep.Value, length));
                    }
                    // The node just read has not been examined yet: run the checks again on it
                    // without advancing the reader.
                    goto ok;
                }
                if (reader.NodeType == XmlNodeType.EndElement)
                {
                    if (stack.TryPop(out ep))
                    {
                        // The position refers to the first character of the qualified name in the
                        // end tag; adding the qualified name's length (prefix included, hence Name
                        // rather than LocalName) lands just past the closing '>'.
                        var length = line.LinePosition + reader.Name.Length - ep.Value;
                        Console.WriteLine("|{3}|{0} : {1} -> {2}", reader.LocalName, ep.Value, length, line.LineNumber);
                        store.TryAdd(ep.Key, new Segment(ep.Value, length));
                    }
                }
            }
        }
    }

    //
    /// <summary>
    /// Builds the path step for the element the reader is positioned on.
    /// </summary>
    /// <param name="element">Reader positioned on an element. IsStartElement() is called on it,
    /// which may move the reader forward to the next content node.</param>
    /// <returns>
    /// <c>/name-id</c> when the element has an <c>id</c> attribute, <c>/name</c> otherwise,
    /// or null when the reader is not on a start or empty element.
    /// </returns>
    public static string Path(XmlReader element)
    {
        if (!(element.IsStartElement() || element.IsEmptyElement))
            return null;
        if (!element.HasAttributes)
            return "/" + element.LocalName;
        var id = element.GetAttribute("id");
        return string.Format(id == null ? "/{0}" : "/{0}-{1}", element.LocalName, id);
    }
}
}
输出:
/1|filter : 34 -> 17
/1|filter : 51 -> 17
|1|group : 6 -> 70
|1|root : 0 -> 83
> /root Segment(0, 83) <root><group id="0" combiner="or"><filter id="1" /><filter id="2" /></group></root>
> /root/group-0 Segment(6, 70) <group id="0" combiner="or"><filter id="1" /><filter id="2" /></group>
> /root/group-0/filter-1 Segment(34, 17) <filter id="1" />
> /root/group-0/filter-2 Segment(51, 17) <filter id="2" />
关键在于发现了 IXmlLineInfo:这是一个由 XmlReader 的实现类显式实现的接口(interface),而这一点很难查到。
注意事项
在收到关于这个问题(this question)的各种评论之后,我先做一些预防性的说明 :):
- 集合的并发版本在此示例中并非必需。我清楚这一点,只是乐于使用它们 :)
- 路径方案可以很容易地推广,但这涵盖了我的所有需求。
- 我知道 id 通常用作文档范围内的唯一标识符,我很高兴在这个特定的上下文中使用它们。
- 可以很容易地扩展 Segment:再增加一个长度字段,指向开始标记的结束符号 >,这样就能只提取文档树中任意元素的开始标记(含属性),从而为任意目标元素重建其上下文。对于较浅的树,这应该能以很小的常数开销同时访问目标元素及其上下文信息。
- 我完全清楚这一切可能值得,也可能不值得去做:我还没有针对自己场景的任何测量数据。只是想开发一种方法并与大家分享。
关于c# - 如何在 .NET 中获取 xml 元素的流位置,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/3613713/