c# - 在 C# 中使用 IFilter 并从数据库而不是文件系统中检索文件

标签 c# ifilter

对于 C# 网络应用程序,我想从存储在数据库中的 PDF、DOC 等文件中索引文本。

我一直在试验 an IFilter example on Code Project这对文件系统中的文件非常有效,但我的文件存储在 MS-SQL 数据库中。

任何人都可以帮我找到一个示例以从存储在数据库中的文件中提取文本,或者知道如何修改代码项目代码以使用数据库而不是文件系统吗?

最佳答案

终于在几个小时后我想出了如何使这个工作!我需要对存储在数据库中的 PDF 内容运行 IFilter,并且我想避免将数据保存到临时文件中。

首先我尝试使用 BindIFilterFromStream为存储在 Stream 中的内容创建 IFilter 的 API,但它似乎不能正常工作(至少在这种情况下不能)。所以不要走那条路。

相反,您需要为文件扩展名创建一个 IFilter(或以其他方式访问它)。然后你可以访问 IPersistStream COM 接口(interface)并使用它将 PDF 内容加载到 IFilter 中。其余工作与文件相同。但是,请注意 IPersistStream API 可能并非由每个 IFilter 实现。不过它适用于 Adob​​e PDF IFilter。

代码应如下所示(我删除了一些返回码检查以使代码更具可读性,但是,您应该检查所有可能的返回码)。

private string ParseIFilter(Stream s)
{
   // Get an IFilter for a file or file extension
   IFilter filter = null;
   FilterReturnCodes result = NativeMethods.LoadIFilter(".pdf", null, ref filter);
   if (result != FilterReturnCodes.S_OK)
   {
      Marshal.ThrowExceptionForHR((int)result);
   }

   // Copy the content to global memory
   byte[] buffer = new byte[s.Length];
   s.Read(buffer, 0, buffer.Length);
   IntPtr nativePtr = Marshal.AllocHGlobal(buffer.Length);
   Marshal.Copy(buffer, 0, nativePtr, buffer.Length);

   // Create a COM stream
   System.Runtime.InteropServices.ComTypes.IStream comStream;
   NativeMethods.CreateStreamOnHGlobal(nativePtr, true, out comStream);

   // Load the contents to the iFilter using IPersistStream interface
   var persistStream = (IPersistStream)filter;
   persistStream.Load(comStream);

   // Initialize iFilter
   FilterFlags filterFlags;
   FilterReturnCodes result = filter.Init(
      FilterInit.IFILTER_INIT_INDEXING_ONLY, 0, IntPtr.Zero, out filterFlags);

   return ExtractTextFromIFilter(filter);
}

从过滤器中提取的文本在我的代码中看起来像这样。网络上有很多这方面的示例,可以根据您的需要以多种方式实现。

private string ExtractTextFromIFilter(IFilter filter)
{
   var sb = new StringBuilder();

   while (true)
   {
      StatChunk chunk;
      result = filter.GetChunk(out chunk);

      if (result == FilterReturnCodes.S_OK)
      {
         if (chunk.flags == ChunkState.CHUNK_TEXT)
         {
            sb.Append(ExtractTextFromChunk(filter, chunk));
         }

         continue;
      }

      if (result == FilterReturnCodes.FILTER_E_END_OF_CHUNKS)
      {
         return sb.ToString();
      }

      Marshal.ThrowExceptionForHR((int)result);
   }
}

private virtual string ExtractTextFromChunk(IFilter filter, StatChunk chunk)
{
   var sb = new StringBuilder();

   var result = FilterReturnCodes.S_OK;
   while (result == FilterReturnCodes.S_OK)
   {
      int sizeBuffer = 16384;
      var buffer = new StringBuilder(sizeBuffer);
      result = filter.GetText(ref sizeBuffer, buffer);

      if ((result == FilterReturnCodes.S_OK) || (result == FilterReturnCodes.FILTER_S_LAST_TEXT))
      {
         if((sizeBuffer > 0) && (buffer.Length > 0))
         {
            sb.Append(buffer.ToString(0, sizeBuffer));
         }
      }

      if (result == FilterReturnCodes.FILTER_E_NO_TEXT)
      {
         return string.Empty;
      }

      if ((result == FilterReturnCodes.FILTER_S_LAST_TEXT) || (result == FilterReturnCodes.FILTER_E_NO_MORE_TEXT))
      {
         return sb.ToString();
      }
   }

   return sb.ToString();
}

这里是本地方法的定义和它们使用的结构。

internal static class NativeMethods
{
    [DllImport("query.dll", SetLastError = true, CharSet = CharSet.Unicode)]
    public static extern FilterReturnCodes LoadIFilter(
        string pwcsPath,
        [MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter,
        ref IFilter ppIUnk);

    [DllImport("ole32.dll")]
    public static extern int CreateStreamOnHGlobal(IntPtr hGlobal, bool fDeleteOnRelease, out IStream ppstm);
}

[ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IFilter
{
    [PreserveSig]
    FilterReturnCodes Init(FilterInit grfFlags, int cAttributes, IntPtr aAttributes, out FilterFlags pdwFlags);

    [PreserveSig]
    FilterReturnCodes GetChunk(out StatChunk pStat);

    [PreserveSig]
    FilterReturnCodes GetText(
        ref int pcwcBuffer,
        [Out, MarshalAs(UnmanagedType.LPWStr)] StringBuilder awcBuffer);

    [PreserveSig]
    FilterReturnCodes GetValue(ref IntPtr propVal);

    [PreserveSig]
    FilterReturnCodes BindRegion(ref FilterRegion origPos, ref Guid riid, ref object ppunk);
}

[InterfaceType(ComInterfaceType.InterfaceIsIUnknown), Guid("0000010c-0000-0000-C000-000000000046")]
public interface IPersist
{
    void GetClassID(out Guid pClassID);
}

[InterfaceType(ComInterfaceType.InterfaceIsIUnknown), Guid("00000109-0000-0000-C000-000000000046")]
public interface IPersistStream : IPersist
{
    new void GetClassID(out Guid pClassID);

    [PreserveSig]
    int IsDirty();

    void Load([In] IStream pStm);

    void Save(
        [In] IStream pStm,
        [In, MarshalAs(UnmanagedType.Bool)] bool fClearDirty);

    void GetSizeMax(out long pcbSize);
}

public struct StatChunk
{
    public int idChunk;
    [MarshalAs(UnmanagedType.U4)]
    public ChunkBreaktype breakType;
    [MarshalAs(UnmanagedType.U4)]
    public ChunkState flags;
    public int locale;
    public FullPropSpec attribute;
    public int idChunkSource;
    public int cwcStartSource;
    public int cwcLenSource;
}

public enum ChunkBreaktype
{
    CHUNK_NO_BREAK = 0,
    CHUNK_EOW = 1,
    CHUNK_EOS = 2,
    CHUNK_EOP = 3,
    CHUNK_EOC = 4
}

public enum ChunkState
{
    CHUNK_TEXT = 0x1,
    CHUNK_VALUE = 0x2,
    CHUNK_FILTER_OWNED_VALUE = 0x4
}

[Flags]
public enum FilterFlags
{
    IFILTER_FLAGS_OLE_PROPERTIES = 1
}

[Flags]
public enum FilterInit
{
    IFILTER_INIT_CANON_PARAGRAPHS = 1,
    IFILTER_INIT_HARD_LINE_BREAKS = 2,
    IFILTER_INIT_CANON_HYPHENS = 4,
    IFILTER_INIT_CANON_SPACES = 8,
    IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16,
    IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 256,
    IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32,
    IFILTER_INIT_INDEXING_ONLY = 64,
    IFILTER_INIT_SEARCH_LINKS = 128,
    IFILTER_INIT_FILTER_OWNED_VALUE_OK = 512
}

public struct FilterRegion
{
    public int idChunk;
    public int cwcStart;
    public int cwcExtent;
}

public enum FilterReturnCodes : uint
{
    S_OK = 0,
    E_ACCESSDENIED = 0x80070005,
    E_HANDLE = 0x80070006,
    E_INVALIDARG = 0x80070057,
    E_OUTOFMEMORY = 0x8007000E,
    E_NOTIMPL = 0x80004001,
    E_FAIL = 0x80000008,
    FILTER_E_PASSWORD = 0x8004170B,
    FILTER_E_UNKNOWNFORMAT = 0x8004170C,
    FILTER_E_NO_TEXT = 0x80041705,
    FILTER_E_NO_VALUES = 0x80041706,
    FILTER_E_END_OF_CHUNKS = 0x80041700,
    FILTER_E_NO_MORE_TEXT = 0x80041701,
    FILTER_E_NO_MORE_VALUES = 0x80041702,
    FILTER_E_ACCESS = 0x80041703,
    FILTER_W_MONIKER_CLIPPED = 0x00041704,
    FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
    FILTER_E_LINK_UNAVAILABLE = 0x80041708,
    FILTER_S_LAST_TEXT = 0x00041709,
    FILTER_S_LAST_VALUES = 0x0004170A
}

public struct FullPropSpec
{
    public Guid guidPropSet;
    public PropSpec psProperty;
}

[StructLayout(LayoutKind.Explicit)]
public struct PropSpec
{
    [FieldOffset(0)]
    public int ulKind;     

    [FieldOffset(4)]
    public int propid;

    [FieldOffset(4)]
    public IntPtr lpwstr;
}

关于c# - 在 C# 中使用 IFilter 并从数据库而不是文件系统中检索文件,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/7313828/

相关文章:

c# - 获取 ArrayList 中对象的属性值

c# - Angular 发布到 .net Web API

c# - 毒物队列的到期日期

c# - 从c#在mysql中创建表

c# - 从字节数组读取 .rtf(富文本格式)时过滤错误代码 FILTER_E_ACCESS

Windows 搜索 - IFilter 搜索词突出显示

方法参数的 c# 自定义属性 - 它是如何工作的?

pdf - SQL Server 2012-在文件表顶部进行全文搜索-未搜索PDF

pdf - 如何在MSSQL Server 2017中使用Adobe iFilter 11

c++ - 使用 IFilter C++ 获取 OLE 属性