How to parse / read a PDF file and convert it to text format programmatically in ASP.NET C#?
The following code demonstrates how to read/parse a PDF file and convert its contents into text/string format using a StreamWriter and PDFBox.
Steps:
1. Download the following files and put them into the bin folder.
- FontBox-0.1.0-dev.dll
- IKVM.GNU.Classpath.dll
- IKVM.Runtime.dll
- PDFBox-0.7.3.dll
Add these assembly files as references to the working web application directory using Add Reference.
2. Then add the following namespaces:
using org.pdfbox.pdmodel;
using org.pdfbox.util;
using System.IO;
using org.pdfbox.util;
using System.IO;
3. Create a new, empty plain-text file named "textfilename.txt", where the text converted from the PDF file will be stored. Add both your PDF file and the text file to the project itself.
Sample full code demo:
using System;
using System.Collections;
using System.Configuration;
using System.Data;
using System.Linq;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Xml.Linq;
using org.pdfbox.pdmodel;
using org.pdfbox.util;
using System.IO;
namespace MyWebApps.pdf
{
    /// <summary>
    /// Web Forms page that extracts the text of a PDF file with PDFBox and
    /// stores it in a plain-text file.
    /// </summary>
    public partial class pdfread : System.Web.UI.Page
    {
        /// <summary>
        /// Converts "pdffilename.pdf" into "textfilename.txt" on every page load.
        /// Both names are resolved to physical paths with Server.MapPath.
        /// </summary>
        protected void Page_Load(object sender, EventArgs e)
        {
            parsePDF(Server.MapPath("pdffilename.pdf"), Server.MapPath("textfilename.txt"));
        }

        /// <summary>
        /// Writes a blank line, the current date/time and then the text extracted
        /// from the PDF into the output file. The output file is overwritten, not
        /// appended to.
        /// </summary>
        /// <param name="pdf_in">Path of the PDF file to read.</param>
        /// <param name="txt_out">Path of the text file to write.</param>
        /// <remarks>
        /// Failures while writing or while reading the PDF are not rethrown; the
        /// exception message is written to the HTTP response instead. Failures
        /// while opening the output file propagate to the caller.
        /// </remarks>
        public void parsePDF(string pdf_in, string txt_out)
        {
            // The using statement closes and disposes the writer on every path.
            using (StreamWriter sw = new StreamWriter(txt_out, false))
            {
                PDDocument doc = null;
                try
                {
                    sw.WriteLine();
                    sw.WriteLine(DateTime.Now.ToString());
                    doc = PDDocument.load(pdf_in);
                    PDFTextStripper stripper = new PDFTextStripper();
                    sw.Write(stripper.getText(doc));
                }
                catch (Exception ex)
                {
                    // Encode the message so that it cannot inject markup into the page.
                    Response.Write(Server.HtmlEncode(ex.Message));
                }
                finally
                {
                    // Release the PDF; doc is still null when load itself failed.
                    if (doc != null)
                    {
                        doc.close();
                    }
                }
            }
        }
    }
}
No comments:
Post a Comment