Searching/Indexing additional document/mime types

Posted by Community Admin on 04-Aug-2018 15:40

Searching/Indexing additional document/mime types

All Replies

Posted by Community Admin on 31-Mar-2013 00:00

Hi Everyone:

I have been investigating whether Sitefinity can be extended to extract content from additional document/MIME types, e.g. PowerPoint documents, Excel files, etc.  I have decompiled the Sitefinity libraries and found the following class, which is responsible for text extraction (see below).  There also seem to be some configuration elements that register the list of available MIME types (I cannot seem to find the actual config file).

The class itself is internal so I cannot even call it or reference it from my own code, so that's not going to work.  The class is called by the DocumentService which is also internal.

Is there a prescribed method for adding an additional ITextExtractor to be used for pulling out text from files?  I could easily write one, but I have no way to hook it into the default DocumentService.

Suggestions?

/// <summary>
/// Text extractor that forwards to a format-specific inner extractor chosen
/// from the MIME type passed to <see cref="Initialize"/>.
/// </summary>
/// <remarks>
/// Reconstructed from decompiler output: block braces and the <c>{0}</c>
/// format placeholder were lost when the snippet was posted, and the
/// <c>JustDecompileGenerated_*</c> members were auto-property artifacts.
/// </remarks>
internal class DefaultTextExtractor : ITextExtractor
{
    // Held by ExecuteDocumentAction for the whole duration of an action,
    // so at most one document action runs at a time.
    private static readonly object syncLock = new object();

    private string mimeType;

    /// <summary>Gets the MIME type stored by <see cref="Initialize"/>.</summary>
    public string MimeType
    {
        get
        {
            return this.mimeType;
        }
    }

    public DefaultTextExtractor()
    {
    }

    /// <summary>
    /// Runs <paramref name="action"/> on a new STA thread and blocks until it finishes.
    /// </summary>
    /// <param name="action">The extraction work to execute.</param>
    /// <remarks>
    /// Any exception thrown by the action is wrapped, written to the error log
    /// and NOT rethrown, so callers are not notified of extraction failures.
    /// </remarks>
    private static void ExecuteDocumentAction(Action action)
    {
        lock (DefaultTextExtractor.syncLock)
        {
            Thread thread = new Thread(() =>
            {
                try
                {
                    action();
                }
                catch (Exception inner)
                {
                    Exception wrapped = new Exception("Error extracting the text content of a document", inner);
                    Log.Write(wrapped, ConfigurationPolicy.ErrorLog);
                }
            });
            thread.SetApartmentState(ApartmentState.STA);
            thread.Start();
            thread.Join();
        }
    }

    /// <summary>
    /// Creates the inner extractor that matches the current MIME type.
    /// </summary>
    /// <returns>
    /// A new inner extractor, or <c>null</c> when the MIME type is <c>null</c>
    /// or not one of the five supported values.
    /// </returns>
    private ITextExtractor GetInnerTextExtractor()
    {
        // The inner extractors are only constructed here; their Initialize
        // methods are not invoked by this class.
        switch (this.mimeType)
        {
            case "text/rtf":
                return new DefaultTextExtractor.DefaultRtfTextExtractor();
            case "text/html":
                return new DefaultTextExtractor.DefaultHtmlTextExtractor();
            case "text/plain":
                return new DefaultTextExtractor.DefaultTxtTextExtractor();
            case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                return new DefaultTextExtractor.DefaultDocxTextExtractor();
            case "application/pdf":
                return new DefaultTextExtractor.DefaultPdfTextExtractor();
            default:
                return null;
        }
    }

    /// <summary>
    /// Extracts the text of <paramref name="doc"/> into <paramref name="text"/>
    /// using the inner extractor for the configured MIME type.
    /// </summary>
    /// <param name="doc">Stream containing the source document.</param>
    /// <param name="text">Stream that receives the extracted text.</param>
    /// <exception cref="InvalidOperationException">
    /// No inner extractor exists for the configured MIME type.
    /// </exception>
    public void GetText(Stream doc, Stream text)
    {
        ITextExtractor innerTextExtractor = this.GetInnerTextExtractor();
        if (innerTextExtractor == null)
        {
            // The "{0}" placeholder was missing in the posted snippet, which
            // left the offending MIME type out of the message.
            throw new InvalidOperationException(string.Format("The MIME type '{0}' is not supported by DefaultTextExtractor.", this.mimeType));
        }

        innerTextExtractor.GetText(doc, text);
    }

    /// <summary>Stores the MIME type this extractor will handle.</summary>
    /// <param name="mimeType">MIME type of the documents to extract; must be non-empty.</param>
    /// <param name="config">Not used by this implementation.</param>
    /// <exception cref="ArgumentException"><paramref name="mimeType"/> is null or empty.</exception>
    public void Initialize(string mimeType, NameValueCollection config)
    {
        if (string.IsNullOrEmpty(mimeType))
        {
            throw new ArgumentException("The default text extractor needs a MIME type.");
        }

        this.mimeType = mimeType;
    }

    /// <summary>Extracts text from DOCX documents.</summary>
    private class DefaultDocxTextExtractor : ITextExtractor
    {
        public string MimeType { get; private set; }

        public DefaultDocxTextExtractor()
        {
        }

        /// <summary>
        /// Reads all of <paramref name="doc"/>, imports it as DOCX and exports
        /// it as plain text into <paramref name="text"/>.
        /// </summary>
        public void GetText(Stream doc, Stream text)
        {
            DefaultTextExtractor.ExecuteDocumentAction(() =>
            {
                byte[] content;
                // Disposing the reader also closes the caller's doc stream.
                using (BinaryReader reader = new BinaryReader(doc))
                {
                    content = reader.ReadBytes((int)doc.Length);
                }

                RadDocument document = (new DocxFormatProvider()).Import(content);
                (new TxtFormatProvider()).Export(document, text);
            });
        }

        /// <summary>Sets the fixed DOCX MIME type; both arguments are ignored.</summary>
        public void Initialize(string mimeType, NameValueCollection config)
        {
            this.MimeType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
        }
    }

    /// <summary>Extracts text from HTML documents.</summary>
    private class DefaultHtmlTextExtractor : ITextExtractor
    {
        public string MimeType { get; private set; }

        public DefaultHtmlTextExtractor()
        {
        }

        /// <summary>
        /// Imports <paramref name="doc"/> as HTML and exports it as plain text
        /// into <paramref name="text"/>.
        /// </summary>
        public void GetText(Stream doc, Stream text)
        {
            DefaultTextExtractor.ExecuteDocumentAction(() =>
            {
                RadDocument document = (new HtmlFormatProvider()).Import(doc);
                (new TxtFormatProvider()).Export(document, text);
            });
        }

        /// <summary>Sets the fixed HTML MIME type; both arguments are ignored.</summary>
        public void Initialize(string mimeType, NameValueCollection config)
        {
            this.MimeType = "text/html";
        }
    }

    /// <summary>Extracts text from PDF documents.</summary>
    private class DefaultPdfTextExtractor : ITextExtractor
    {
        public string MimeType { get; private set; }

        public DefaultPdfTextExtractor()
        {
        }

        /// <summary>
        /// Reads all of <paramref name="doc"/>, imports it as a PDF and writes
        /// the exported text to <paramref name="text"/> as UTF-8.
        /// </summary>
        public void GetText(Stream doc, Stream text)
        {
            DefaultTextExtractor.ExecuteDocumentAction(() =>
            {
                byte[] content;
                // Disposing the reader also closes the caller's doc stream.
                using (BinaryReader reader = new BinaryReader(doc))
                {
                    content = reader.ReadBytes((int)doc.Length);
                }

                // The bytes are copied into a MemoryStream that is handed to
                // the PDF provider together with the ReadOnDemand setting.
                MemoryStream buffer = new MemoryStream();
                buffer.Write(content, 0, (int)content.Length);
                RadFixedDocument pdfDocument = (new PdfFormatProvider(buffer, FormatProviderSettings.ReadOnDemand)).Import();

                TextFormatProvider textProvider = new TextFormatProvider();
                TextFormatProviderSettings exportSettings = new TextFormatProviderSettings("\r\n", string.Empty);
                string extracted = textProvider.Export(pdfDocument, exportSettings);

                // The writer is flushed but not disposed, which leaves the
                // caller's text stream open.
                StreamWriter writer = new StreamWriter(text, Encoding.UTF8);
                writer.Write(extracted);
                writer.Flush();
            });
        }

        /// <summary>Sets the fixed PDF MIME type; both arguments are ignored.</summary>
        public void Initialize(string mimeType, NameValueCollection config)
        {
            this.MimeType = "application/pdf";
        }
    }

    /// <summary>Extracts text from RTF documents.</summary>
    private class DefaultRtfTextExtractor : ITextExtractor
    {
        public string MimeType { get; private set; }

        public DefaultRtfTextExtractor()
        {
        }

        /// <summary>
        /// Imports <paramref name="doc"/> as RTF and exports it as plain text
        /// into <paramref name="text"/>.
        /// </summary>
        public void GetText(Stream doc, Stream text)
        {
            DefaultTextExtractor.ExecuteDocumentAction(() =>
            {
                RadDocument document = (new RtfFormatProvider()).Import(doc);
                (new TxtFormatProvider()).Export(document, text);
            });
        }

        /// <summary>Sets the fixed RTF MIME type; both arguments are ignored.</summary>
        public void Initialize(string mimeType, NameValueCollection config)
        {
            this.MimeType = "text/rtf";
        }
    }

    /// <summary>Extracts text from plain-text documents.</summary>
    private class DefaultTxtTextExtractor : ITextExtractor
    {
        public string MimeType { get; private set; }

        public DefaultTxtTextExtractor()
        {
        }

        /// <summary>
        /// Imports <paramref name="doc"/> as plain text and exports it as plain
        /// text into <paramref name="text"/>.
        /// </summary>
        public void GetText(Stream doc, Stream text)
        {
            DefaultTextExtractor.ExecuteDocumentAction(() =>
            {
                RadDocument document = (new TxtFormatProvider()).Import(doc);
                (new TxtFormatProvider()).Export(document, text);
            });
        }

        /// <summary>Sets the fixed MIME type; both arguments are ignored.</summary>
        public void Initialize(string mimeType, NameValueCollection config)
        {
            // NOTE(review): the outer class selects this extractor for
            // "text/plain", yet the value stored here is "text/txt". Kept
            // as found in the decompiled source - confirm which is intended.
            this.MimeType = "text/txt";
        }
    }
}

Posted by Community Admin on 01-Apr-2014 00:00

Any suggestions on this? I need to specify mime types for SVG images.

This thread is closed