Searching/Indexing additional document/mime types
Hi Everyone:
I have been investigating whether Sitefinity can be extended to extract content from additional document/MIME types, e.g. PowerPoint documents, Excel files, etc. I have decompiled the Sitefinity libraries and found the following class, which is responsible for text extraction (see below). There also seem to be some configuration elements that register the list of available MIME types (I cannot seem to find the actual config file).
The class itself is internal so I cannot even call it or reference it from my own code, so that's not going to work. The class is called by the DocumentService which is also internal.
Is there a prescribed way to register an additional ITextExtractor for pulling text out of files? I could easily write one, but I have no way to hook it into the default DocumentService.
Suggestions?
/// <summary>
/// Text extractor that dispatches to a format-specific inner extractor chosen
/// by the MIME type supplied to <see cref="Initialize"/>.
/// </summary>
/// <remarks>
/// Recognized MIME types: text/rtf, text/html, text/plain,
/// application/vnd.openxmlformats-officedocument.wordprocessingml.document
/// and application/pdf.
/// </remarks>
internal class DefaultTextExtractor : ITextExtractor
{
    // Serializes all document actions: only one extraction runs at a time.
    private static readonly object syncLock = new object();

    private string mimeType;

    /// <summary>Gets the MIME type this extractor was initialized with.</summary>
    public string MimeType
    {
        get { return this.mimeType; }
    }

    /// <summary>
    /// Runs <paramref name="action"/> on a dedicated STA thread and blocks until it
    /// finishes. Calls are serialized through a process-wide lock.
    /// </summary>
    /// <param name="action">The extraction work to execute.</param>
    /// <remarks>
    /// Any exception thrown by <paramref name="action"/> is wrapped, written to the
    /// error log and NOT propagated to the caller.
    /// </remarks>
    private static void ExecuteDocumentAction(Action action)
    {
        lock (DefaultTextExtractor.syncLock)
        {
            Thread worker = new Thread(() =>
            {
                try
                {
                    action();
                }
                catch (Exception inner)
                {
                    // Best effort: log the failure instead of letting it tear down the worker thread.
                    Exception wrapped = new Exception("Error extracting the text content of a document", inner);
                    Log.Write(wrapped, ConfigurationPolicy.ErrorLog);
                }
            });

            worker.SetApartmentState(ApartmentState.STA);
            worker.Start();
            worker.Join();
        }
    }

    /// <summary>
    /// Creates the format-specific extractor for the current MIME type.
    /// </summary>
    /// <returns>
    /// A new inner extractor, or <c>null</c> when the MIME type is null or not recognized.
    /// </returns>
    private ITextExtractor GetInnerTextExtractor()
    {
        switch (this.mimeType)
        {
            case "text/rtf":
                return new DefaultTextExtractor.DefaultRtfTextExtractor();
            case "text/html":
                return new DefaultTextExtractor.DefaultHtmlTextExtractor();
            case "text/plain":
                return new DefaultTextExtractor.DefaultTxtTextExtractor();
            case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                return new DefaultTextExtractor.DefaultDocxTextExtractor();
            case "application/pdf":
                return new DefaultTextExtractor.DefaultPdfTextExtractor();
            default:
                return null;
        }
    }

    /// <summary>
    /// Extracts the text of <paramref name="doc"/> and writes it to <paramref name="text"/>.
    /// </summary>
    /// <param name="doc">Stream containing the source document.</param>
    /// <param name="text">Stream that receives the extracted text.</param>
    /// <exception cref="InvalidOperationException">
    /// No inner extractor exists for the current MIME type.
    /// </exception>
    public void GetText(Stream doc, Stream text)
    {
        ITextExtractor innerTextExtractor = this.GetInnerTextExtractor();
        if (innerTextExtractor == null)
        {
            // The placeholder must be "{0}"; a bare "0" would drop the MIME type from the message.
            throw new InvalidOperationException(string.Format("The MIME type '{0}' is not supported by DefaultTextExtractor.", this.mimeType));
        }

        innerTextExtractor.GetText(doc, text);
    }

    /// <summary>
    /// Stores the MIME type that later selects the inner extractor.
    /// </summary>
    /// <param name="mimeType">MIME type of the documents to process; must not be null or empty.</param>
    /// <param name="config">Not used by this implementation.</param>
    /// <exception cref="ArgumentException"><paramref name="mimeType"/> is null or empty.</exception>
    public void Initialize(string mimeType, NameValueCollection config)
    {
        if (string.IsNullOrEmpty(mimeType))
        {
            throw new ArgumentException("The default text extractor needs a MIME type.");
        }

        this.mimeType = mimeType;
    }

    /// <summary>Extracts text from DOCX documents.</summary>
    private class DefaultDocxTextExtractor : ITextExtractor
    {
        /// <summary>Gets the MIME type set by <see cref="Initialize"/>.</summary>
        public string MimeType { get; private set; }

        /// <summary>
        /// Reads <paramref name="doc"/> into memory, imports it as DOCX and exports it
        /// as plain text into <paramref name="text"/>.
        /// </summary>
        public void GetText(Stream doc, Stream text)
        {
            DefaultTextExtractor.ExecuteDocumentAction(() =>
            {
                byte[] content;

                // NOTE: disposing the BinaryReader also closes doc.
                using (BinaryReader reader = new BinaryReader(doc))
                {
                    content = reader.ReadBytes((int)doc.Length);
                }

                RadDocument document = new DocxFormatProvider().Import(content);
                new TxtFormatProvider().Export(document, text);
            });
        }

        /// <summary>Sets <see cref="MimeType"/> to the DOCX MIME type; both arguments are ignored.</summary>
        public void Initialize(string mimeType, NameValueCollection config)
        {
            this.MimeType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
        }
    }

    /// <summary>Extracts text from HTML documents.</summary>
    private class DefaultHtmlTextExtractor : ITextExtractor
    {
        /// <summary>Gets the MIME type set by <see cref="Initialize"/>.</summary>
        public string MimeType { get; private set; }

        /// <summary>
        /// Imports <paramref name="doc"/> as HTML and exports it as plain text into
        /// <paramref name="text"/>.
        /// </summary>
        public void GetText(Stream doc, Stream text)
        {
            DefaultTextExtractor.ExecuteDocumentAction(() =>
            {
                RadDocument document = new HtmlFormatProvider().Import(doc);
                new TxtFormatProvider().Export(document, text);
            });
        }

        /// <summary>Sets <see cref="MimeType"/> to "text/html"; both arguments are ignored.</summary>
        public void Initialize(string mimeType, NameValueCollection config)
        {
            this.MimeType = "text/html";
        }
    }

    /// <summary>Extracts text from PDF documents.</summary>
    private class DefaultPdfTextExtractor : ITextExtractor
    {
        /// <summary>Gets the MIME type set by <see cref="Initialize"/>.</summary>
        public string MimeType { get; private set; }

        /// <summary>
        /// Copies <paramref name="doc"/> into memory, imports it as PDF and writes the
        /// exported text to <paramref name="text"/> as UTF-8.
        /// </summary>
        public void GetText(Stream doc, Stream text)
        {
            DefaultTextExtractor.ExecuteDocumentAction(() =>
            {
                byte[] content;

                // NOTE: disposing the BinaryReader also closes doc.
                using (BinaryReader reader = new BinaryReader(doc))
                {
                    content = reader.ReadBytes((int)doc.Length);
                }

                MemoryStream buffer = new MemoryStream();
                buffer.Write(content, 0, content.Length);

                RadFixedDocument document = new PdfFormatProvider(buffer, FormatProviderSettings.ReadOnDemand).Import();
                TextFormatProviderSettings exportSettings = new TextFormatProviderSettings("\r\n", string.Empty);
                string extracted = new TextFormatProvider().Export(document, exportSettings);

                // Flush rather than dispose: disposing the writer would also close text.
                StreamWriter writer = new StreamWriter(text, Encoding.UTF8);
                writer.Write(extracted);
                writer.Flush();
            });
        }

        /// <summary>Sets <see cref="MimeType"/> to "application/pdf"; both arguments are ignored.</summary>
        public void Initialize(string mimeType, NameValueCollection config)
        {
            this.MimeType = "application/pdf";
        }
    }

    /// <summary>Extracts text from RTF documents.</summary>
    private class DefaultRtfTextExtractor : ITextExtractor
    {
        /// <summary>Gets the MIME type set by <see cref="Initialize"/>.</summary>
        public string MimeType { get; private set; }

        /// <summary>
        /// Imports <paramref name="doc"/> as RTF and exports it as plain text into
        /// <paramref name="text"/>.
        /// </summary>
        public void GetText(Stream doc, Stream text)
        {
            DefaultTextExtractor.ExecuteDocumentAction(() =>
            {
                RadDocument document = new RtfFormatProvider().Import(doc);
                new TxtFormatProvider().Export(document, text);
            });
        }

        /// <summary>Sets <see cref="MimeType"/> to "text/rtf"; both arguments are ignored.</summary>
        public void Initialize(string mimeType, NameValueCollection config)
        {
            this.MimeType = "text/rtf";
        }
    }

    /// <summary>Extracts text from plain-text documents.</summary>
    private class DefaultTxtTextExtractor : ITextExtractor
    {
        /// <summary>Gets the MIME type set by <see cref="Initialize"/>.</summary>
        public string MimeType { get; private set; }

        /// <summary>
        /// Imports <paramref name="doc"/> as plain text and re-exports it into
        /// <paramref name="text"/>.
        /// </summary>
        public void GetText(Stream doc, Stream text)
        {
            DefaultTextExtractor.ExecuteDocumentAction(() =>
            {
                RadDocument document = new TxtFormatProvider().Import(doc);
                new TxtFormatProvider().Export(document, text);
            });
        }

        /// <summary>Sets <see cref="MimeType"/> to "text/plain"; both arguments are ignored.</summary>
        public void Initialize(string mimeType, NameValueCollection config)
        {
            // Must match the "text/plain" key under which the outer class selects this extractor.
            this.MimeType = "text/plain";
        }
    }
}
Any suggestions on this? I need to specify mime types for SVG images.