Here an improved answer of ShravankumarKumar. I created special classes for the pages so you can access words in the pdf based on the text rows and the word in that row.
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
//create a list of pdf pages
var pages = new List<PdfPage>();
//load the pdf into the reader. NOTE: path can also be replaced with a byte array
using (PdfReader reader = new PdfReader(path))
{
//loop all the pages and extract the text
for (int i = 1; i <= reader.NumberOfPages; i++)
{
pages.Add(new PdfPage()
{
content = PdfTextExtractor.GetTextFromPage(reader, i)
});
}
}
//use linq to create the rows and words by splitting on newline and space
pages.ForEach(x => x.rows = x.content.Split('\n').Select(y =>
new PdfRow() {
content = y,
words = y.Split(' ').ToList()
}
).ToList());
The custom classes
class PdfPage
{
public string content { get; set; }
public List<PdfRow> rows { get; set; }
}
class PdfRow
{
public string content { get; set; }
public List<string> words { get; set; }
}
Now you can get a word by row and word index.
string myWord = pages[0].rows[12].words[4];
Or use Linq to find the rows containing a specific word.
//find the rows in a specific page containing a word
var myRows = pages[0].rows.Where(x => x.words.Any(y => y == "myWord1")).ToList();
//find the rows in all pages containing a word
var myRows = pages.SelectMany(r => r.rows).Where(x => x.words.Any(y => y == "myWord2")).ToList();