我正在尝试解析通过 TCP 发送到我的 C# 应用程序的 XML 消息。不幸的是,协议(protocol)无法更改,XML 消息没有定界,也没有使用长度前缀。此外,字符编码不固定,但每条消息都以 XML 声明开头 <?xml>
.问题是,我如何使用 C# 一次读取一条 XML 消息。
到目前为止,我尝试将数据从 TCP 流读取到字节数组中并通过 MemoryStream
使用它.问题是,缓冲区可能包含多个 XML 消息,或者第一条消息可能不完整。在这些情况下,我在尝试使用 XmlReader.Read
或 XmlDocument.Load
解析它时会遇到异常,但不幸的是 XmlException
并不能真正让我区分问题(解析本地化错误字符串除外)。
我尝试使用 XmlReader.Read
并计算 Element
的数量和 EndElement
节点。这样我就知道我何时读完了第一条完整的 XML 消息。
但是,有几个问题。如果缓冲区还没有包含整个消息,我如何区分 XmlException
来自一个实际上无效的、格式不正确的消息?换句话说,如果在读取第一个根 EndElement
之前抛出异常,我如何决定是中止错误连接,还是从 TCP 流中收集更多字节?
如果没有异常发生,XmlReader
位于根的开头 EndElement
.类型转换XmlReader
至 IXmlLineInfo
给我当前的 LineNumber
和 LinePosition
,但是要获取 EndElement
真正结束处的字节位置并不直接。为此,我必须将字节数组转换为字符串(使用 XML 声明中指定的编码),查找 LineNumber
和 LinePosition
并将其转换回字节偏移量。我尝试用 StreamReader.ReadLine
做到这一点, 但流读取器不提供对当前字节位置的公共(public)访问权限。
所有这些看起来都非常不优雅且不健壮。我想知道您是否有更好的解决方案的想法。谢谢。
最佳答案
研究了一段时间后,我想我可以这样回答我自己的问题(我可能错了,欢迎指正):
我找不到方法让
XmlReader
可以继续解析第二条 XML 消息(至少不能,如果第二条消息有XmlDeclaration
)。XmlTextReader.ResetState
可以做类似的事情,但为此我必须假设所有消息的编码相同。因此我无法将XmlReader
直接连接到 TcpStream。关闭
XmlReader
后,缓冲区并不停留在读取器最后读到的位置。所以不可能关闭读取器并使用新的读取器继续下一条消息。我猜这是因为读取器并不能在每种可能的输入流上成功定位(seek)。当
XmlReader
抛出异常时,无法确定它是由于过早的 EOF 还是由于格式不正确的 XML 而发生的。XmlReader.EOF
在发生异常的情况下不会被设置。作为解决方法,我派生了自己的 MemoryStream,它将最后一个字节单独返回。这样我就知道XmlReader
确实读取到了最后一个字节,随后的异常可能是由于消息被截断造成的(这有点草率,因为它可能无法检测到每条格式不正确的消息。但是,在将更多字节附加到缓冲区后,迟早会检测到错误)。我可以将我的
XmlReader
转换为IXmlLineInfo
接口(interface),通过它可以访问当前节点的LineNumber
和LinePosition
。所以在读取第一条消息后,我记住了这些位置并用它来截断缓冲区。真正草率的部分来了,因为我必须使用字符编码来获取字节位置。我相信您能找到会让下面的代码出错的测试用例(例如,具有混合编码的内部元素)。但到目前为止,它适用于我的所有测试。
这是我想出的解析器类——它可能有用吗(我知道,它远非完美...)
/// <summary>
/// Accumulates raw bytes received from a stream and extracts concatenated,
/// non-delimited XML messages from them one at a time.
/// </summary>
/// <remarks>
/// Bytes are added with <see cref="Append"/>; <see cref="deserialize"/> parses the
/// first message in the buffer and removes its bytes from the buffer.
/// </remarks>
class XmlParser {
    // Line-break characters recognised when mapping reader line info back to a
    // character index (\r, \n and \r\n are all treated as a single line break).
    private static readonly char[] NewlineChars = { '\r', '\n' };

    // All bytes received so far that have not yet been consumed by deserialize().
    private byte[] buffer = new byte[0];

    /// <summary>Number of bytes currently held in the internal buffer.</summary>
    public int Length {
        get {
            return buffer.Length;
        }
    }

    /// <summary>
    /// Appends new binary data to the end of the internal buffer.
    /// </summary>
    /// <param name="buffer2">Bytes to append; null or empty input is ignored.</param>
    /// <returns>This instance, so calls can be chained.</returns>
    public XmlParser Append(byte[] buffer2) {
        if (buffer2 != null && buffer2.Length > 0) {
            // Not an efficient way to do this (a new array per call);
            // the EofMemoryStream could handle a List<byte[]> instead.
            byte[] new_buffer = new byte[buffer.Length + buffer2.Length];
            buffer.CopyTo(new_buffer, 0);
            buffer2.CopyTo(new_buffer, buffer.Length);
            buffer = new_buffer;
        }
        return this;
    }

    /// <summary>
    /// Read-only, forward-only stream over a byte array that hands out the last
    /// byte of the array in a read of its own and records when a read hit the end.
    /// </summary>
    /// <remarks>
    /// Because the final byte is never returned together with earlier bytes, a
    /// buffering reader has to issue a further read after it, and only that read
    /// returns 0 and sets <see cref="EOF"/>. An exception thrown while
    /// <see cref="EOF"/> is still false therefore happened before the reader had
    /// tried to read past the end of the data.
    /// </remarks>
    private class EofMemoryStream : Stream {
        /// <summary>True once a non-empty read request returned no data.</summary>
        public bool EOF { get; private set; }

        private MemoryStream mem_;

        public EofMemoryStream(byte[] buffer) {
            mem_ = new MemoryStream(buffer, false);
            EOF = false;
        }

        public override bool CanSeek {
            get {
                return false;
            }
        }

        public override bool CanWrite {
            get {
                return false;
            }
        }

        public override bool CanRead {
            get {
                return true;
            }
        }

        public override long Length {
            get {
                return mem_.Length;
            }
        }

        public override long Position {
            get {
                return mem_.Position;
            }
            set {
                throw new NotSupportedException();
            }
        }

        public override void Flush() {
            mem_.Flush();
        }

        public override long Seek(long offset, SeekOrigin origin) {
            throw new NotSupportedException();
        }

        public override void SetLength(long value) {
            throw new NotSupportedException();
        }

        public override void Write(byte[] buffer, int offset, int count) {
            throw new NotSupportedException();
        }

        /// <summary>
        /// Reads at most <paramref name="count"/> bytes, but never returns the last
        /// byte of the underlying array together with any preceding byte.
        /// </summary>
        /// <returns>Number of bytes read; 0 at the end of the data.</returns>
        public override int Read(byte[] buffer, int offset, int count) {
            // Stop one byte short of the end; when only the last byte (or nothing)
            // is left, allow a read of exactly one byte.
            count = Math.Min(count, Math.Max(1, (int)(Length - Position - 1)));
            int nread = mem_.Read(buffer, offset, count);
            // A zero-length request also returns 0 but says nothing about the end
            // of the data, so it must not raise the EOF flag.
            if (nread == 0 && count > 0) {
                EOF = true;
            }
            return nread;
        }

        protected override void Dispose(bool disposing) {
            if (disposing) {
                mem_.Dispose();
            }
            base.Dispose(disposing);
        }
    }

    /// <summary>
    /// Parses the first XML message in the buffer and removes its bytes from the buffer.
    /// </summary>
    /// <returns>
    /// The parsed message, or null when the buffer is empty or the first message
    /// is not complete yet (more bytes need to be appended).
    /// </returns>
    /// <exception cref="XmlException">
    /// The XML failed to parse before the reader attempted to read past the end
    /// of the buffer, i.e. the data is assumed to be malformed rather than truncated.
    /// </exception>
    /// <exception cref="InvalidOperationException">
    /// A further message follows but the end of the first one could not be located
    /// in the buffer.
    /// </exception>
    public Message deserialize() {
        if (buffer.Length == 0) {
            return null;
        }
        Message message = null;
        Encoding encoding = Message.default_encoding;
        using (EofMemoryStream sbuffer = new EofMemoryStream(buffer)) {
            XmlDocument xmlDocument = null;
            XmlReaderSettings settings = new XmlReaderSettings();
            int LineNumber = -1;
            int LinePosition = -1;
            bool truncate_buffer = false;
            using (XmlReader xmlReader = XmlReader.Create(sbuffer, settings)) {
                try {
                    // Read to the first node, skipping whitespace and comments.
                    // MoveToContent is not used here because it would skip the
                    // XmlDeclaration too.
                    while (xmlReader.Read() &&
                           (xmlReader.NodeType == XmlNodeType.Whitespace ||
                            xmlReader.NodeType == XmlNodeType.Comment)) {
                    }
                    // If the message has an XmlDeclaration, extract its encoding;
                    // it is needed later to convert character offsets to byte offsets.
                    if (xmlReader.NodeType == XmlNodeType.XmlDeclaration) {
                        while (xmlReader.MoveToNextAttribute()) {
                            if (xmlReader.Name == "encoding") {
                                encoding = Encoding.GetEncoding(xmlReader.Value);
                            }
                        }
                        // Step back from the attribute to the declaration node,
                        // then advance past it.
                        xmlReader.MoveToContent();
                        xmlReader.Read();
                    }
                    // Move to the first element.
                    xmlReader.MoveToContent();
                    if (xmlReader.EOF) {
                        return null;
                    }
                    // Read the entire root element into a document.
                    xmlDocument = new XmlDocument();
                    XmlReader subtree = xmlReader.ReadSubtree();
                    xmlDocument.Load(subtree);
                    // The parent reader must not be used until the subtree reader is
                    // closed. It is closed only after a successful load (not in a
                    // using/finally) so that a failed parse does not pull further
                    // bytes from the stream and distort the EOF heuristic below.
                    subtree.Close();
                } catch (XmlException) {
                    // Parsing failed. If the reader already tried to read past the
                    // last byte, the message is assumed to be merely incomplete.
                    // Otherwise the XML is assumed invalid and the exception is
                    // re-thrown with its original stack trace.
                    if (sbuffer.EOF) {
                        return null;
                    }
                    throw;
                }

                // Try to deserialize into a specialised data container named after
                // the root element; fall back to a plain Message.
                Type type = null;
                try {
                    type = Type.GetType("my.namespace." + xmlDocument.DocumentElement.Name);
                } catch (Exception) {
                    // Type lookup failed: no specialised container is used.
                }
                if (type == null) {
                    message = new Message();
                } else {
                    // TODO: reuse the serializer...
                    System.Xml.Serialization.XmlSerializer ser = new System.Xml.Serialization.XmlSerializer(type);
                    message = (Message)ser.Deserialize(new XmlNodeReader(xmlDocument));
                }
                message.doc = xmlDocument;

                // The first XML message was successfully parsed.
                // Remember the line position of the current (end) element.
                IXmlLineInfo xmlLineInfo = xmlReader as IXmlLineInfo;
                if (xmlLineInfo != null && xmlLineInfo.HasLineInfo()) {
                    LineNumber = xmlLineInfo.LineNumber;
                    LinePosition = xmlLineInfo.LinePosition;
                }

                // Try to read the rest of the buffer. Any failure here is taken as
                // the sign that another message follows, in which case the buffer
                // has to be cut at the end of the first message using the line info.
                try {
                    while (xmlReader.Read()) {
                    }
                } catch (Exception) {
                    truncate_buffer = true;
                }
            }

            if (truncate_buffer) {
                if (LineNumber < 0) {
                    throw new InvalidOperationException("LineNumber not given. Cannot truncate xml buffer");
                }
                // Convert the buffer to a string using the encoding found before
                // (or the default encoding).
                // NOTE(review): a leading byte-order mark may shift positions on the
                // first line relative to the reader's line info - confirm with BOM'd input.
                string s = encoding.GetString(buffer);

                // Seek to the start of the recorded line.
                int char_index = 0;
                while (--LineNumber > 0) {
                    int lineBreak = s.IndexOfAny(NewlineChars, char_index);
                    if (lineBreak < 0) {
                        throw new InvalidOperationException("could not find EndElement to truncate the xml buffer.");
                    }
                    char_index = lineBreak + 1;
                    // Treat \r\n as a single line break.
                    if (s[lineBreak] == '\r' && char_index < s.Length && s[char_index] == '\n') {
                        char_index++;
                    }
                }
                char_index += LinePosition - 1;

                // The recorded position must be the root element name followed by
                // optional whitespace and '>'. The name is escaped because XML names
                // may contain regex metacharacters such as '.'.
                // NOTE(review): a self-closing root element (<root/>) does not match
                // this pattern and ends in the exception below - confirm whether
                // such messages can occur.
                var rgx = new System.Text.RegularExpressions.Regex(
                    System.Text.RegularExpressions.Regex.Escape(xmlDocument.DocumentElement.Name) + "[ \r\n\t]*\\>");
                System.Text.RegularExpressions.Match match = rgx.Match(s, char_index);
                if (!match.Success || match.Index != char_index) {
                    throw new InvalidOperationException("could not find EndElement to truncate the xml buffer.");
                }
                char_index += match.Value.Length;

                // Convert the character offset back to a byte offset (for the given
                // encoding) and drop the consumed bytes from the buffer.
                int line1_boffset = encoding.GetByteCount(s.Substring(0, char_index));
                buffer = buffer.Skip(line1_boffset).ToArray();
            } else {
                // Nothing but the first message was in the buffer.
                buffer = new byte[0];
            }
        }
        return message;
    }
}
关于c# - 使用 C# 从 TCP 流中解析串联的、非定界的 XML 消息,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/2942581/