[c#] Reading PDF documents in .Net

Is there an open source library that will help me with reading/parsing PDF documents in .Net/C#?

This question is related to c# .net pdf

The answer is


Since this question was last answered in 2008, iTextSharp has improved their api dramatically. If you download the latest version of their api from http://sourceforge.net/projects/itextsharp/, you can use the following snippet of code to extract all text from a pdf into a string.

using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;

namespace PdfParser
{
    public static class PdfTextExtractor
    {
        public static string pdfText(string path)
        {
            PdfReader reader = new PdfReader(path);
            string text = string.Empty;
            for(int page = 1; page <= reader.NumberOfPages; page++)
            {
                text += PdfTextExtractor.GetTextFromPage(reader,page);
            }
            reader.Close();
            return text;
        }   
    }
}

iTextSharp is the best bet. Used it to make a spider for lucene.Net so that it could crawl PDF.

using System;
using System.IO;
using iTextSharp.text.pdf;
using System.Text.RegularExpressions;

namespace Spider.Utils
{
    /// <summary>
    /// Parses a PDF file and extracts the text from it.
    /// </summary>
    public class PDFParser
    {
        /// BT = Beginning of a text object operator 
        /// ET = End of a text object operator
        /// Td move to the start of next line
        ///  5 Ts = superscript
        /// -5 Ts = subscript

        #region Fields

        #region _numberOfCharsToKeep
        /// <summary>
        /// The number of characters to keep, when extracting text.
        /// </summary>
        private static int _numberOfCharsToKeep = 15;
        #endregion

        #endregion

        #region ExtractText
        /// <summary>
        /// Extracts a text from a PDF file.
        /// </summary>
        /// <param name="inFileName">the full path to the pdf file.</param>
        /// <param name="outFileName">the output file name.</param>
        /// <returns>the extracted text</returns>
        public bool ExtractText(string inFileName, string outFileName)
        {
            StreamWriter outFile = null;
            try
            {
                // Create a reader for the given PDF file
                PdfReader reader = new PdfReader(inFileName);
                //outFile = File.CreateText(outFileName);
                outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);

                Console.Write("Processing: ");

                int totalLen = 68;
                float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
                int totalWritten = 0;
                float curUnit = 0;

                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                    // Write the progress.
                    if (charUnit >= 1.0f)
                    {
                        for (int i = 0; i < (int)charUnit; i++)
                        {
                            Console.Write("#");
                            totalWritten++;
                        }
                    }
                    else
                    {
                        curUnit += charUnit;
                        if (curUnit >= 1.0f)
                        {
                            for (int i = 0; i < (int)curUnit; i++)
                            {
                                Console.Write("#");
                                totalWritten++;
                            }
                            curUnit = 0;
                        }

                    }
                }

                if (totalWritten < totalLen)
                {
                    for (int i = 0; i < (totalLen - totalWritten); i++)
                    {
                        Console.Write("#");
                    }
                }
                return true;
            }
            catch
            {
                return false;
            }
            finally
            {
                if (outFile != null) outFile.Close();
            }
        }
        #endregion

        #region ExtractTextFromPDFBytes
        /// <summary>
        /// This method processes an uncompressed Adobe (text) object 
        /// and extracts text.
        /// </summary>
        /// <param name="input">uncompressed</param>
        /// <returns></returns>
        public string ExtractTextFromPDFBytes(byte[] input)
        {
            if (input == null || input.Length == 0) return "";

            try
            {
                string resultString = "";

                // Flag showing if we are we currently inside a text object
                bool inTextObject = false;

                // Flag showing if the next character is literal 
                // e.g. '\\' to get a '\' character or '\(' to get '('
                bool nextLiteral = false;

                // () Bracket nesting level. Text appears inside ()
                int bracketDepth = 0;

                // Keep previous chars to get extract numbers etc.:
                char[] previousCharacters = new char[_numberOfCharsToKeep];
                for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';


                for (int i = 0; i < input.Length; i++)
                {
                    char c = (char)input[i];
                    if (input[i] == 213)
                        c = "'".ToCharArray()[0];

                    if (inTextObject)
                    {
                        // Position the text
                        if (bracketDepth == 0)
                        {
                            if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
                            {
                                resultString += "\n\r";
                            }
                            else
                            {
                                if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters))
                                {
                                    resultString += "\n";
                                }
                                else
                                {
                                    if (CheckToken(new string[] { "Tj" }, previousCharacters))
                                    {
                                        resultString += " ";
                                    }
                                }
                            }
                        }

                        // End of a text object, also go to a new line.
                        if (bracketDepth == 0 &&
                            CheckToken(new string[] { "ET" }, previousCharacters))
                        {

                            inTextObject = false;
                            resultString += " ";
                        }
                        else
                        {
                            // Start outputting text
                            if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                            {
                                bracketDepth = 1;
                            }
                            else
                            {
                                // Stop outputting text
                                if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                                {
                                    bracketDepth = 0;
                                }
                                else
                                {
                                    // Just a normal text character:
                                    if (bracketDepth == 1)
                                    {
                                        // Only print out next character no matter what. 
                                        // Do not interpret.
                                        if (c == '\\' && !nextLiteral)
                                        {
                                            resultString += c.ToString();
                                            nextLiteral = true;
                                        }
                                        else
                                        {
                                            if (((c >= ' ') && (c <= '~')) ||
                                                ((c >= 128) && (c < 255)))
                                            {
                                                resultString += c.ToString();
                                            }

                                            nextLiteral = false;
                                        }
                                    }
                                }
                            }
                        }
                    }

                    // Store the recent characters for 
                    // when we have to go back for a checking
                    for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
                    {
                        previousCharacters[j] = previousCharacters[j + 1];
                    }
                    previousCharacters[_numberOfCharsToKeep - 1] = c;

                    // Start of a text object
                    if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
                    {
                        inTextObject = true;
                    }
                }

                return CleanupContent(resultString);
            }
            catch
            {
                return "";
            }
        }

        private string CleanupContent(string text)
        {
            string[] patterns = { @"\\\(", @"\\\)", @"\\226", @"\\222", @"\\223", @"\\224", @"\\340", @"\\342", @"\\344", @"\\300", @"\\302", @"\\304", @"\\351", @"\\350", @"\\352", @"\\353", @"\\311", @"\\310", @"\\312", @"\\313", @"\\362", @"\\364", @"\\366", @"\\322", @"\\324", @"\\326", @"\\354", @"\\356", @"\\357", @"\\314", @"\\316", @"\\317", @"\\347", @"\\307", @"\\371", @"\\373", @"\\374", @"\\331", @"\\333", @"\\334", @"\\256", @"\\231", @"\\253", @"\\273", @"\\251", @"\\221"};
            string[] replace = {   "(",     ")",      "-",     "'",      "\"",      "\"",    "à",      "â",      "ä",      "À",      "Â",      "Ä",      "é",      "è",      "ê",      "ë",      "É",      "È",      "Ê",      "Ë",      "ò",      "ô",      "ö",      "Ò",      "Ô",      "Ö",      "ì",      "î",      "ï",      "Ì",      "Î",      "Ï",      "ç",      "Ç",      "ù",      "û",      "ü",      "Ù",      "Û",      "Ü",      "®",      "™",      "«",      "»",      "©",      "'" };

            for (int i = 0; i < patterns.Length; i++)
            {
                string regExPattern = patterns[i];
                Regex regex = new Regex(regExPattern, RegexOptions.IgnoreCase);
                text = regex.Replace(text, replace[i]);
            }

            return text;
        }

        #endregion

        #region CheckToken
        /// <summary>
        /// Check if a certain 2 character token just came along (e.g. BT)
        /// </summary>
        /// <param name="tokens">the searched token</param>
        /// <param name="recent">the recent character array</param>
        /// <returns></returns>
        private bool CheckToken(string[] tokens, char[] recent)
        {
            foreach (string token in tokens)
            {
                if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
                    (recent[_numberOfCharsToKeep - 2] == token[1]) &&
                    ((recent[_numberOfCharsToKeep - 1] == ' ') ||
                    (recent[_numberOfCharsToKeep - 1] == 0x0d) ||
                    (recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
                    ((recent[_numberOfCharsToKeep - 4] == ' ') ||
                    (recent[_numberOfCharsToKeep - 4] == 0x0d) ||
                    (recent[_numberOfCharsToKeep - 4] == 0x0a))
                    )
                {
                    return true;
                }
            }
            return false;
        }
        #endregion
    }
}

iTextSharp is the best bet. Used it to make a spider for lucene.Net so that it could crawl PDF.

using System;
using System.IO;
using iTextSharp.text.pdf;
using System.Text.RegularExpressions;

namespace Spider.Utils
{
    /// <summary>
    /// Parses a PDF file and extracts the text from it.
    /// </summary>
    public class PDFParser
    {
        /// BT = Beginning of a text object operator 
        /// ET = End of a text object operator
        /// Td move to the start of next line
        ///  5 Ts = superscript
        /// -5 Ts = subscript

        #region Fields

        #region _numberOfCharsToKeep
        /// <summary>
        /// The number of characters to keep, when extracting text.
        /// </summary>
        private static int _numberOfCharsToKeep = 15;
        #endregion

        #endregion

        #region ExtractText
        /// <summary>
        /// Extracts a text from a PDF file.
        /// </summary>
        /// <param name="inFileName">the full path to the pdf file.</param>
        /// <param name="outFileName">the output file name.</param>
        /// <returns>the extracted text</returns>
        public bool ExtractText(string inFileName, string outFileName)
        {
            StreamWriter outFile = null;
            try
            {
                // Create a reader for the given PDF file
                PdfReader reader = new PdfReader(inFileName);
                //outFile = File.CreateText(outFileName);
                outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);

                Console.Write("Processing: ");

                int totalLen = 68;
                float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
                int totalWritten = 0;
                float curUnit = 0;

                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                    // Write the progress.
                    if (charUnit >= 1.0f)
                    {
                        for (int i = 0; i < (int)charUnit; i++)
                        {
                            Console.Write("#");
                            totalWritten++;
                        }
                    }
                    else
                    {
                        curUnit += charUnit;
                        if (curUnit >= 1.0f)
                        {
                            for (int i = 0; i < (int)curUnit; i++)
                            {
                                Console.Write("#");
                                totalWritten++;
                            }
                            curUnit = 0;
                        }

                    }
                }

                if (totalWritten < totalLen)
                {
                    for (int i = 0; i < (totalLen - totalWritten); i++)
                    {
                        Console.Write("#");
                    }
                }
                return true;
            }
            catch
            {
                return false;
            }
            finally
            {
                if (outFile != null) outFile.Close();
            }
        }
        #endregion

        #region ExtractTextFromPDFBytes
        /// <summary>
        /// This method processes an uncompressed Adobe (text) object 
        /// and extracts text.
        /// </summary>
        /// <param name="input">uncompressed</param>
        /// <returns></returns>
        public string ExtractTextFromPDFBytes(byte[] input)
        {
            if (input == null || input.Length == 0) return "";

            try
            {
                string resultString = "";

                // Flag showing if we are we currently inside a text object
                bool inTextObject = false;

                // Flag showing if the next character is literal 
                // e.g. '\\' to get a '\' character or '\(' to get '('
                bool nextLiteral = false;

                // () Bracket nesting level. Text appears inside ()
                int bracketDepth = 0;

                // Keep previous chars to get extract numbers etc.:
                char[] previousCharacters = new char[_numberOfCharsToKeep];
                for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';


                for (int i = 0; i < input.Length; i++)
                {
                    char c = (char)input[i];
                    if (input[i] == 213)
                        c = "'".ToCharArray()[0];

                    if (inTextObject)
                    {
                        // Position the text
                        if (bracketDepth == 0)
                        {
                            if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
                            {
                                resultString += "\n\r";
                            }
                            else
                            {
                                if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters))
                                {
                                    resultString += "\n";
                                }
                                else
                                {
                                    if (CheckToken(new string[] { "Tj" }, previousCharacters))
                                    {
                                        resultString += " ";
                                    }
                                }
                            }
                        }

                        // End of a text object, also go to a new line.
                        if (bracketDepth == 0 &&
                            CheckToken(new string[] { "ET" }, previousCharacters))
                        {

                            inTextObject = false;
                            resultString += " ";
                        }
                        else
                        {
                            // Start outputting text
                            if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                            {
                                bracketDepth = 1;
                            }
                            else
                            {
                                // Stop outputting text
                                if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                                {
                                    bracketDepth = 0;
                                }
                                else
                                {
                                    // Just a normal text character:
                                    if (bracketDepth == 1)
                                    {
                                        // Only print out next character no matter what. 
                                        // Do not interpret.
                                        if (c == '\\' && !nextLiteral)
                                        {
                                            resultString += c.ToString();
                                            nextLiteral = true;
                                        }
                                        else
                                        {
                                            if (((c >= ' ') && (c <= '~')) ||
                                                ((c >= 128) && (c < 255)))
                                            {
                                                resultString += c.ToString();
                                            }

                                            nextLiteral = false;
                                        }
                                    }
                                }
                            }
                        }
                    }

                    // Store the recent characters for 
                    // when we have to go back for a checking
                    for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
                    {
                        previousCharacters[j] = previousCharacters[j + 1];
                    }
                    previousCharacters[_numberOfCharsToKeep - 1] = c;

                    // Start of a text object
                    if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
                    {
                        inTextObject = true;
                    }
                }

                return CleanupContent(resultString);
            }
            catch
            {
                return "";
            }
        }

        private string CleanupContent(string text)
        {
            string[] patterns = { @"\\\(", @"\\\)", @"\\226", @"\\222", @"\\223", @"\\224", @"\\340", @"\\342", @"\\344", @"\\300", @"\\302", @"\\304", @"\\351", @"\\350", @"\\352", @"\\353", @"\\311", @"\\310", @"\\312", @"\\313", @"\\362", @"\\364", @"\\366", @"\\322", @"\\324", @"\\326", @"\\354", @"\\356", @"\\357", @"\\314", @"\\316", @"\\317", @"\\347", @"\\307", @"\\371", @"\\373", @"\\374", @"\\331", @"\\333", @"\\334", @"\\256", @"\\231", @"\\253", @"\\273", @"\\251", @"\\221"};
            string[] replace = {   "(",     ")",      "-",     "'",      "\"",      "\"",    "à",      "â",      "ä",      "À",      "Â",      "Ä",      "é",      "è",      "ê",      "ë",      "É",      "È",      "Ê",      "Ë",      "ò",      "ô",      "ö",      "Ò",      "Ô",      "Ö",      "ì",      "î",      "ï",      "Ì",      "Î",      "Ï",      "ç",      "Ç",      "ù",      "û",      "ü",      "Ù",      "Û",      "Ü",      "®",      "™",      "«",      "»",      "©",      "'" };

            for (int i = 0; i < patterns.Length; i++)
            {
                string regExPattern = patterns[i];
                Regex regex = new Regex(regExPattern, RegexOptions.IgnoreCase);
                text = regex.Replace(text, replace[i]);
            }

            return text;
        }

        #endregion

        #region CheckToken
        /// <summary>
        /// Check if a certain 2 character token just came along (e.g. BT)
        /// </summary>
        /// <param name="tokens">the searched token</param>
        /// <param name="recent">the recent character array</param>
        /// <returns></returns>
        private bool CheckToken(string[] tokens, char[] recent)
        {
            foreach (string token in tokens)
            {
                if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
                    (recent[_numberOfCharsToKeep - 2] == token[1]) &&
                    ((recent[_numberOfCharsToKeep - 1] == ' ') ||
                    (recent[_numberOfCharsToKeep - 1] == 0x0d) ||
                    (recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
                    ((recent[_numberOfCharsToKeep - 4] == ' ') ||
                    (recent[_numberOfCharsToKeep - 4] == 0x0d) ||
                    (recent[_numberOfCharsToKeep - 4] == 0x0a))
                    )
                {
                    return true;
                }
            }
            return false;
        }
        #endregion
    }
}

iTextSharp is the best bet. Used it to make a spider for lucene.Net so that it could crawl PDF.

using System;
using System.IO;
using iTextSharp.text.pdf;
using System.Text.RegularExpressions;

namespace Spider.Utils
{
    /// <summary>
    /// Parses a PDF file and extracts the text from it.
    /// </summary>
    public class PDFParser
    {
        /// BT = Beginning of a text object operator 
        /// ET = End of a text object operator
        /// Td move to the start of next line
        ///  5 Ts = superscript
        /// -5 Ts = subscript

        #region Fields

        #region _numberOfCharsToKeep
        /// <summary>
        /// The number of characters to keep, when extracting text.
        /// </summary>
        private static int _numberOfCharsToKeep = 15;
        #endregion

        #endregion

        #region ExtractText
        /// <summary>
        /// Extracts a text from a PDF file.
        /// </summary>
        /// <param name="inFileName">the full path to the pdf file.</param>
        /// <param name="outFileName">the output file name.</param>
        /// <returns>the extracted text</returns>
        public bool ExtractText(string inFileName, string outFileName)
        {
            StreamWriter outFile = null;
            try
            {
                // Create a reader for the given PDF file
                PdfReader reader = new PdfReader(inFileName);
                //outFile = File.CreateText(outFileName);
                outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);

                Console.Write("Processing: ");

                int totalLen = 68;
                float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
                int totalWritten = 0;
                float curUnit = 0;

                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                    // Write the progress.
                    if (charUnit >= 1.0f)
                    {
                        for (int i = 0; i < (int)charUnit; i++)
                        {
                            Console.Write("#");
                            totalWritten++;
                        }
                    }
                    else
                    {
                        curUnit += charUnit;
                        if (curUnit >= 1.0f)
                        {
                            for (int i = 0; i < (int)curUnit; i++)
                            {
                                Console.Write("#");
                                totalWritten++;
                            }
                            curUnit = 0;
                        }

                    }
                }

                if (totalWritten < totalLen)
                {
                    for (int i = 0; i < (totalLen - totalWritten); i++)
                    {
                        Console.Write("#");
                    }
                }
                return true;
            }
            catch
            {
                return false;
            }
            finally
            {
                if (outFile != null) outFile.Close();
            }
        }
        #endregion

        #region ExtractTextFromPDFBytes
        /// <summary>
        /// This method processes an uncompressed Adobe (text) object 
        /// and extracts text.
        /// </summary>
        /// <param name="input">uncompressed</param>
        /// <returns></returns>
        public string ExtractTextFromPDFBytes(byte[] input)
        {
            if (input == null || input.Length == 0) return "";

            try
            {
                string resultString = "";

                // Flag showing if we are we currently inside a text object
                bool inTextObject = false;

                // Flag showing if the next character is literal 
                // e.g. '\\' to get a '\' character or '\(' to get '('
                bool nextLiteral = false;

                // () Bracket nesting level. Text appears inside ()
                int bracketDepth = 0;

                // Keep previous chars to get extract numbers etc.:
                char[] previousCharacters = new char[_numberOfCharsToKeep];
                for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';


                for (int i = 0; i < input.Length; i++)
                {
                    char c = (char)input[i];
                    if (input[i] == 213)
                        c = "'".ToCharArray()[0];

                    if (inTextObject)
                    {
                        // Position the text
                        if (bracketDepth == 0)
                        {
                            if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
                            {
                                resultString += "\n\r";
                            }
                            else
                            {
                                if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters))
                                {
                                    resultString += "\n";
                                }
                                else
                                {
                                    if (CheckToken(new string[] { "Tj" }, previousCharacters))
                                    {
                                        resultString += " ";
                                    }
                                }
                            }
                        }

                        // End of a text object, also go to a new line.
                        if (bracketDepth == 0 &&
                            CheckToken(new string[] { "ET" }, previousCharacters))
                        {

                            inTextObject = false;
                            resultString += " ";
                        }
                        else
                        {
                            // Start outputting text
                            if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                            {
                                bracketDepth = 1;
                            }
                            else
                            {
                                // Stop outputting text
                                if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                                {
                                    bracketDepth = 0;
                                }
                                else
                                {
                                    // Just a normal text character:
                                    if (bracketDepth == 1)
                                    {
                                        // Only print out next character no matter what. 
                                        // Do not interpret.
                                        if (c == '\\' && !nextLiteral)
                                        {
                                            resultString += c.ToString();
                                            nextLiteral = true;
                                        }
                                        else
                                        {
                                            if (((c >= ' ') && (c <= '~')) ||
                                                ((c >= 128) && (c < 255)))
                                            {
                                                resultString += c.ToString();
                                            }

                                            nextLiteral = false;
                                        }
                                    }
                                }
                            }
                        }
                    }

                    // Store the recent characters for 
                    // when we have to go back for a checking
                    for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
                    {
                        previousCharacters[j] = previousCharacters[j + 1];
                    }
                    previousCharacters[_numberOfCharsToKeep - 1] = c;

                    // Start of a text object
                    if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
                    {
                        inTextObject = true;
                    }
                }

                return CleanupContent(resultString);
            }
            catch
            {
                return "";
            }
        }

        private string CleanupContent(string text)
        {
            string[] patterns = { @"\\\(", @"\\\)", @"\\226", @"\\222", @"\\223", @"\\224", @"\\340", @"\\342", @"\\344", @"\\300", @"\\302", @"\\304", @"\\351", @"\\350", @"\\352", @"\\353", @"\\311", @"\\310", @"\\312", @"\\313", @"\\362", @"\\364", @"\\366", @"\\322", @"\\324", @"\\326", @"\\354", @"\\356", @"\\357", @"\\314", @"\\316", @"\\317", @"\\347", @"\\307", @"\\371", @"\\373", @"\\374", @"\\331", @"\\333", @"\\334", @"\\256", @"\\231", @"\\253", @"\\273", @"\\251", @"\\221"};
            string[] replace = {   "(",     ")",      "-",     "'",      "\"",      "\"",    "à",      "â",      "ä",      "À",      "Â",      "Ä",      "é",      "è",      "ê",      "ë",      "É",      "È",      "Ê",      "Ë",      "ò",      "ô",      "ö",      "Ò",      "Ô",      "Ö",      "ì",      "î",      "ï",      "Ì",      "Î",      "Ï",      "ç",      "Ç",      "ù",      "û",      "ü",      "Ù",      "Û",      "Ü",      "®",      "™",      "«",      "»",      "©",      "'" };

            for (int i = 0; i < patterns.Length; i++)
            {
                string regExPattern = patterns[i];
                Regex regex = new Regex(regExPattern, RegexOptions.IgnoreCase);
                text = regex.Replace(text, replace[i]);
            }

            return text;
        }

        #endregion

        #region CheckToken
        /// <summary>
        /// Check if a certain 2 character token just came along (e.g. BT)
        /// </summary>
        /// <param name="tokens">the searched token</param>
        /// <param name="recent">the recent character array</param>
        /// <returns></returns>
        private bool CheckToken(string[] tokens, char[] recent)
        {
            foreach (string token in tokens)
            {
                if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
                    (recent[_numberOfCharsToKeep - 2] == token[1]) &&
                    ((recent[_numberOfCharsToKeep - 1] == ' ') ||
                    (recent[_numberOfCharsToKeep - 1] == 0x0d) ||
                    (recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
                    ((recent[_numberOfCharsToKeep - 4] == ' ') ||
                    (recent[_numberOfCharsToKeep - 4] == 0x0d) ||
                    (recent[_numberOfCharsToKeep - 4] == 0x0a))
                    )
                {
                    return true;
                }
            }
            return false;
        }
        #endregion
    }
}

iTextSharp is the best bet. Used it to make a spider for lucene.Net so that it could crawl PDF.

using System;
using System.IO;
using iTextSharp.text.pdf;
using System.Text.RegularExpressions;

namespace Spider.Utils
{
    /// <summary>
    /// Parses a PDF file and extracts the text from it.
    /// </summary>
    public class PDFParser
    {
        /// BT = Beginning of a text object operator 
        /// ET = End of a text object operator
        /// Td move to the start of next line
        ///  5 Ts = superscript
        /// -5 Ts = subscript

        #region Fields

        #region _numberOfCharsToKeep
        /// <summary>
        /// The number of characters to keep, when extracting text.
        /// </summary>
        private static int _numberOfCharsToKeep = 15;
        #endregion

        #endregion

        #region ExtractText
        /// <summary>
        /// Extracts a text from a PDF file.
        /// </summary>
        /// <param name="inFileName">the full path to the pdf file.</param>
        /// <param name="outFileName">the output file name.</param>
        /// <returns>the extracted text</returns>
        public bool ExtractText(string inFileName, string outFileName)
        {
            StreamWriter outFile = null;
            try
            {
                // Create a reader for the given PDF file
                PdfReader reader = new PdfReader(inFileName);
                //outFile = File.CreateText(outFileName);
                outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);

                Console.Write("Processing: ");

                int totalLen = 68;
                float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
                int totalWritten = 0;
                float curUnit = 0;

                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                    // Write the progress.
                    if (charUnit >= 1.0f)
                    {
                        for (int i = 0; i < (int)charUnit; i++)
                        {
                            Console.Write("#");
                            totalWritten++;
                        }
                    }
                    else
                    {
                        curUnit += charUnit;
                        if (curUnit >= 1.0f)
                        {
                            for (int i = 0; i < (int)curUnit; i++)
                            {
                                Console.Write("#");
                                totalWritten++;
                            }
                            curUnit = 0;
                        }

                    }
                }

                if (totalWritten < totalLen)
                {
                    for (int i = 0; i < (totalLen - totalWritten); i++)
                    {
                        Console.Write("#");
                    }
                }
                return true;
            }
            catch
            {
                return false;
            }
            finally
            {
                if (outFile != null) outFile.Close();
            }
        }
        #endregion

        #region ExtractTextFromPDFBytes
        /// <summary>
        /// This method processes an uncompressed Adobe (text) object 
        /// and extracts text.
        /// </summary>
        /// <param name="input">uncompressed</param>
        /// <returns></returns>
        public string ExtractTextFromPDFBytes(byte[] input)
        {
            if (input == null || input.Length == 0) return "";

            try
            {
                string resultString = "";

                // Flag showing if we are we currently inside a text object
                bool inTextObject = false;

                // Flag showing if the next character is literal 
                // e.g. '\\' to get a '\' character or '\(' to get '('
                bool nextLiteral = false;

                // () Bracket nesting level. Text appears inside ()
                int bracketDepth = 0;

                // Keep previous chars to get extract numbers etc.:
                char[] previousCharacters = new char[_numberOfCharsToKeep];
                for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';


                for (int i = 0; i < input.Length; i++)
                {
                    char c = (char)input[i];
                    if (input[i] == 213)
                        c = "'".ToCharArray()[0];

                    if (inTextObject)
                    {
                        // Position the text
                        if (bracketDepth == 0)
                        {
                            if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
                            {
                                resultString += "\n\r";
                            }
                            else
                            {
                                if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters))
                                {
                                    resultString += "\n";
                                }
                                else
                                {
                                    if (CheckToken(new string[] { "Tj" }, previousCharacters))
                                    {
                                        resultString += " ";
                                    }
                                }
                            }
                        }

                        // End of a text object, also go to a new line.
                        if (bracketDepth == 0 &&
                            CheckToken(new string[] { "ET" }, previousCharacters))
                        {

                            inTextObject = false;
                            resultString += " ";
                        }
                        else
                        {
                            // Start outputting text
                            if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                            {
                                bracketDepth = 1;
                            }
                            else
                            {
                                // Stop outputting text
                                if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                                {
                                    bracketDepth = 0;
                                }
                                else
                                {
                                    // Just a normal text character:
                                    if (bracketDepth == 1)
                                    {
                                        // Only print out next character no matter what. 
                                        // Do not interpret.
                                        if (c == '\\' && !nextLiteral)
                                        {
                                            resultString += c.ToString();
                                            nextLiteral = true;
                                        }
                                        else
                                        {
                                            if (((c >= ' ') && (c <= '~')) ||
                                                ((c >= 128) && (c < 255)))
                                            {
                                                resultString += c.ToString();
                                            }

                                            nextLiteral = false;
                                        }
                                    }
                                }
                            }
                        }
                    }

                    // Store the recent characters for 
                    // when we have to go back for a checking
                    for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
                    {
                        previousCharacters[j] = previousCharacters[j + 1];
                    }
                    previousCharacters[_numberOfCharsToKeep - 1] = c;

                    // Start of a text object
                    if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
                    {
                        inTextObject = true;
                    }
                }

                return CleanupContent(resultString);
            }
            catch
            {
                return "";
            }
        }

        private string CleanupContent(string text)
        {
            string[] patterns = { @"\\\(", @"\\\)", @"\\226", @"\\222", @"\\223", @"\\224", @"\\340", @"\\342", @"\\344", @"\\300", @"\\302", @"\\304", @"\\351", @"\\350", @"\\352", @"\\353", @"\\311", @"\\310", @"\\312", @"\\313", @"\\362", @"\\364", @"\\366", @"\\322", @"\\324", @"\\326", @"\\354", @"\\356", @"\\357", @"\\314", @"\\316", @"\\317", @"\\347", @"\\307", @"\\371", @"\\373", @"\\374", @"\\331", @"\\333", @"\\334", @"\\256", @"\\231", @"\\253", @"\\273", @"\\251", @"\\221"};
            string[] replace = {   "(",     ")",      "-",     "'",      "\"",      "\"",    "à",      "â",      "ä",      "À",      "Â",      "Ä",      "é",      "è",      "ê",      "ë",      "É",      "È",      "Ê",      "Ë",      "ò",      "ô",      "ö",      "Ò",      "Ô",      "Ö",      "ì",      "î",      "ï",      "Ì",      "Î",      "Ï",      "ç",      "Ç",      "ù",      "û",      "ü",      "Ù",      "Û",      "Ü",      "®",      "™",      "«",      "»",      "©",      "'" };

            for (int i = 0; i < patterns.Length; i++)
            {
                string regExPattern = patterns[i];
                Regex regex = new Regex(regExPattern, RegexOptions.IgnoreCase);
                text = regex.Replace(text, replace[i]);
            }

            return text;
        }

        #endregion

        #region CheckToken
        /// <summary>
        /// Check if a certain 2 character token just came along (e.g. BT)
        /// </summary>
        /// <param name="tokens">the searched token</param>
        /// <param name="recent">the recent character array</param>
        /// <returns></returns>
        private bool CheckToken(string[] tokens, char[] recent)
        {
            foreach (string token in tokens)
            {
                if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
                    (recent[_numberOfCharsToKeep - 2] == token[1]) &&
                    ((recent[_numberOfCharsToKeep - 1] == ' ') ||
                    (recent[_numberOfCharsToKeep - 1] == 0x0d) ||
                    (recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
                    ((recent[_numberOfCharsToKeep - 4] == ' ') ||
                    (recent[_numberOfCharsToKeep - 4] == 0x0d) ||
                    (recent[_numberOfCharsToKeep - 4] == 0x0a))
                    )
                {
                    return true;
                }
            }
            return false;
        }
        #endregion
    }
}

PDFClown might help, but I would not recommend it for a big or heavy use application.


PDFClown might help, but I would not recommend it for a big or heavy use application.


public string ReadPdfFile(object Filename, DataTable ReadLibray)
{
    PdfReader reader2 = new PdfReader((string)Filename);
    string strText = string.Empty;

    for (int page = 1; page <= reader2.NumberOfPages; page++)
    {
    ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
    PdfReader reader = new PdfReader((string)Filename);
    String s = PdfTextExtractor.GetTextFromPage(reader, page, its);

    s = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(s)));
    strText = strText + s;
    reader.Close();
    }
    return strText;
}

public string ReadPdfFile(object Filename, DataTable ReadLibray)
{
    PdfReader reader2 = new PdfReader((string)Filename);
    string strText = string.Empty;

    for (int page = 1; page <= reader2.NumberOfPages; page++)
    {
    ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
    PdfReader reader = new PdfReader((string)Filename);
    String s = PdfTextExtractor.GetTextFromPage(reader, page, its);

    s = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(s)));
    strText = strText + s;
    reader.Close();
    }
    return strText;
}

PDFClown might help, but I would not recommend it for a big or heavy use application.


PDFClown might help, but I would not recommend it for a big or heavy use application.


iText is the best library I know. Originally written in Java, there is a .NET port as well.

See http://www.ujihara.jp/iTextdotNET/en/


iText is the best library I know. Originally written in Java, there is a .NET port as well.

See http://www.ujihara.jp/iTextdotNET/en/


iText is the best library I know. Originally written in Java, there is a .NET port as well.

See http://www.ujihara.jp/iTextdotNET/en/


iText is the best library I know. Originally written in Java, there is a .NET port as well.

See http://www.ujihara.jp/iTextdotNET/en/


You could look into this: http://www.codeproject.com/KB/showcase/pdfrasterizer.aspx It's not completely free, but it looks very nice.

Alex


You could look into this: http://www.codeproject.com/KB/showcase/pdfrasterizer.aspx It's not completely free, but it looks very nice.

Alex


aspose pdf works pretty well. then again, you have to pay for it


aspose pdf works pretty well. then again, you have to pay for it




You could look into this: http://www.codeproject.com/KB/showcase/pdfrasterizer.aspx It's not completely free, but it looks very nice.

Alex


You could look into this: http://www.codeproject.com/KB/showcase/pdfrasterizer.aspx It's not completely free, but it looks very nice.

Alex




aspose pdf works pretty well. then again, you have to pay for it


aspose pdf works pretty well. then again, you have to pay for it




There is also LibHaru

http://libharu.org/wiki/Main_Page


There is also LibHaru

http://libharu.org/wiki/Main_Page


There is also LibHaru

http://libharu.org/wiki/Main_Page


There is also LibHaru

http://libharu.org/wiki/Main_Page


Have a look at Docotic.Pdf library. It does not require you to make source code of your application open (like iTextSharp with viral AGPL 3 license, for example).

Docotic.Pdf can be used to read PDF files and extract text with or without formatting. Please have a look at the article that shows how to extract text from PDFs.

Disclaimer: I work for Bit Miracle, vendor of the library.


Have a look at Docotic.Pdf library. It does not require you to make source code of your application open (like iTextSharp with viral AGPL 3 license, for example).

Docotic.Pdf can be used to read PDF files and extract text with or without formatting. Please have a look at the article that shows how to extract text from PDFs.

Disclaimer: I work for Bit Miracle, vendor of the library.


Questions with c# tag:

How can I convert this one line of ActionScript to C#? Microsoft Advertising SDK doesn't deliverer ads How to use a global array in C#? How to correctly write async method? C# - insert values from file into two arrays Uploading into folder in FTP? Are these methods thread safe? dotnet ef not found in .NET Core 3 HTTP Error 500.30 - ANCM In-Process Start Failure Best way to "push" into C# array How can I add raw data body to an axios request? Couldn't process file resx due to its being in the Internet or Restricted zone or having the mark of the web on the file Convert string to boolean in C# Entity Framework Core: A second operation started on this context before a previous operation completed ASP.NET Core - Swashbuckle not creating swagger.json file Is ConfigurationManager.AppSettings available in .NET Core 2.0? No authenticationScheme was specified, and there was no DefaultChallengeScheme found with default authentification and custom authorization Getting value from appsettings.json in .net core .net Core 2.0 - Package was restored using .NetFramework 4.6.1 instead of target framework .netCore 2.0. The package may not be fully compatible Automatically set appsettings.json for dev and release environments in asp.net core? How to use log4net in Asp.net core 2.0 Get ConnectionString from appsettings.json instead of being hardcoded in .NET Core 2.0 App Unable to create migrations after upgrading to ASP.NET Core 2.0 Update .NET web service to use TLS 1.2 Using app.config in .Net Core How to send json data in POST request using C# ASP.NET Core form POST results in a HTTP 415 Unsupported Media Type response How to enable CORS in ASP.net Core WebAPI VS 2017 Metadata file '.dll could not be found How to set combobox default value? How to get root directory of project in asp.net core. Directory.GetCurrentDirectory() doesn't seem to work correctly on a mac ALTER TABLE DROP COLUMN failed because one or more objects access this column Error: the entity type requires a primary key How to POST using HTTPclient content type = application/x-www-form-urlencoded CORS: credentials mode is 'include' Visual Studio 2017: Display method references Where is NuGet.Config file located in Visual Studio project? Unity Scripts edited in Visual studio don't provide autocomplete How to create roles in ASP.NET Core and assign them to users? Return file in ASP.Net Core Web API ASP.NET Core return JSON with status code auto create database in Entity Framework Core Class Diagrams in VS 2017 How to read/write files in .Net Core? How to read values from the querystring with ASP.NET Core? how to set ASPNETCORE_ENVIRONMENT to be considered for publishing an asp.net core application? ASP.NET Core Get Json Array using IConfiguration Entity Framework Core add unique constraint code-first No templates in Visual Studio 2017 ps1 cannot be loaded because running scripts is disabled on this system

Questions with .net tag:

You must add a reference to assembly 'netstandard, Version=2.0.0.0 How to use Bootstrap 4 in ASP.NET Core No authenticationScheme was specified, and there was no DefaultChallengeScheme found with default authentification and custom authorization .net Core 2.0 - Package was restored using .NetFramework 4.6.1 instead of target framework .netCore 2.0. The package may not be fully compatible Update .NET web service to use TLS 1.2 EF Core add-migration Build Failed What is the difference between .NET Core and .NET Standard Class Library project types? Visual Studio 2017 - Could not load file or assembly 'System.Runtime, Version=4.1.0.0' or one of its dependencies Nuget connection attempt failed "Unable to load the service index for source" Token based authentication in Web API without any user interface How to decode JWT Token? What's the difference between .NET Core, .NET Framework, and Xamarin? .NET Core vs Mono How to specify the port an ASP.NET Core application is hosted on? WebForms UnobtrusiveValidationMode requires a ScriptResourceMapping for jquery Why is the Visual Studio 2015/2017/2019 Test Runner not discovering my xUnit v2 tests Build error, This project references NuGet IIS Config Error - This configuration section cannot be used at this path There is no argument given that corresponds to the required formal parameter - .NET Error Could not find a part of the path ... bin\roslyn\csc.exe Can you install and run apps built on the .NET framework on a Mac? What is the purpose of nameof? Microsoft.ReportViewer.Common Version=12.0.0.0 What is the default value for Guid? Authentication failed because remote party has closed the transport stream What is the difference between a Shared Project and a Class Library in Visual Studio 2015? How to break out of the IF statement No connection could be made because the target machine actively refused it 127.0.0.1 Default SecurityProtocol in .NET 4.5 How to properly make a http web GET request What is ".NET Core"? IOException: The process cannot access the file 'file path' because it is being used by another process System.web.mvc missing Disable SSL fallback and use only TLS for outbound connections in .NET? (Poodle mitigation) SQL Connection Error: System.Data.SqlClient.SqlException (0x80131904) Write to Windows Application Event Log Operator overloading ==, !=, Equals System.Net.WebException: The remote name could not be resolved: Running multiple async tasks and waiting for them all to complete What is an "index out of range" exception, and how do I fix it? Found conflicts between different versions of the same dependent assembly that could not be resolved Getting the first and last day of a month, using a given DateTime object Error 1053 the service did not respond to the start or control request in a timely fashion BadImageFormatException. This will occur when running in 64 bit mode with the 32 bit Oracle client components installed Download files from SFTP with SSH.NET library An object reference is required to access a non-static member AppendChild() is not a function javascript Send JSON via POST in C# and Receive the JSON returned? Why is HttpClient BaseAddress not working? KERNELBASE.dll Exception 0xe0434352 offset 0x000000000000a49d

Questions with pdf tag:

ImageMagick security policy 'PDF' blocking conversion How to extract table as text from the PDF using Python? Extract a page from a pdf as a jpeg How can I read pdf in python? Generating a PDF file from React Components Extract Data from PDF and Add to Worksheet How to extract text from a PDF file? How to download PDF automatically using js? Download pdf file using jquery ajax Generate PDF from HTML using pdfMake in Angularjs Generate PDF from Swagger API documentation IPython/Jupyter Problems saving notebook as PDF Extract / Identify Tables from PDF python Create PDF from a list of images VBA Print to PDF and Save with Automatic File Name android download pdf from url then open it with a pdf reader How to convert PDF files to images How to convert webpage into PDF by using Python Window.Open with PDF stream instead of PDF location PDF Blob - Pop up window not showing content How to Display blob (.pdf) in an AngularJS app Excel VBA to Export Selected Sheets to PDF Zoom to fit: PDF Embedded in HTML correct PHP headers for pdf file download Codeigniter how to create PDF HTML embedded PDF iframe Open a selected file (image, pdf, ...) programmatically from my Android Application? Open a PDF using VBA in Excel Printing PDFs from Windows Command Line How to get raw text from pdf file using java How to display pdf in php How to display PDF file in HTML? Android open pdf file How to read pdf file and write it to outputStream How to embed PDF file with responsive width Print PDF directly from JavaScript How to embed a PDF? Save multiple sheets to .pdf How to embed a PDF viewer in a page? Duplicate headers received from server How to display a pdf in a modal window? How to open a PDF file in an <iframe>? How to build PDF file from binary string returned from a web-service using javascript PHP mPDF save file as PDF Pdf.js: rendering a pdf file using a base64 file source instead of url Upload DOC or PDF using PHP Save base64 string as PDF at client side with JavaScript Convert PDF to clean SVG? Display PDF file inside my android application How to Use pdf.js