You are viewing limited content. For full access, please sign in.

Question

Question

Issue with creating searchable PDFs using the Laserfiche SDK

asked on August 25, 2014 Show version history

I’ve written a Windows service that performs the following functions:
• Create a Laserfiche document using a multi-page tiff file 
• OCR the newly created document
• Export the document as a PDF.

The issue with the exported PDF is that, when searching for text, the highlighted match is not located where the text actually appears on the page.

I’ve attached a PDF. When searching for text in file 10791553_SDK.pdf, text might be found, but the highlighted text is not accurate.
10791553.tif is the original file imported into LF.

Below is the code I’m using to create searchable PDFs with the Laserfiche SDK (version 9.0).

 

/// <summary>
/// Builds a temporary Laserfiche document from a directory of page images, OCRs it,
/// exports it as a PDF that includes the recognized text, and deletes the temporary document.
/// </summary>
public class Document
{
    /// <summary>Name of the Windows thumbnail cache file that must not be imported as a page.</summary>
    private const string ThumbnailCacheFileName = "Thumbs.db";

    /// <summary>Size, in bytes, of the buffer used to stream an image file into a page.</summary>
    private const int CopyBufferSize = 0x8000;

    private RepositoryRegistration _repositoryRegistration;
    private Session _session;

    /// <summary>
    /// Logs in to the repository described by <paramref name="item"/> and, when the item's
    /// action is <c>New</c> or <c>Overwrite</c>, creates a document, appends the page images,
    /// OCRs it, exports it to <c>item.FQPdfName</c> and deletes the document again.
    /// </summary>
    /// <param name="item">
    /// The work item. Its <c>DocumentStatus</c> is updated to reflect the outcome.
    /// </param>
    /// <exception cref="Exception">
    /// Any failure is rethrown (with its original stack trace) after the status has been set.
    /// </exception>
    public void Import(ref ImportItem item)
    {
        try
        {
            _repositoryRegistration = new RepositoryRegistration(item.ServerName, item.RepositoryName);
            _session = new Session();
            _session.LogIn(item.UserName, item.Password, _repositoryRegistration);

            bool succeeded = false;
            try
            {
                if (item.DocumentAction == DocumentAction.New || item.DocumentAction == DocumentAction.Overwrite)
                {
                    // 'using' guarantees the entry is disposed even when a step throws.
                    using (DocumentInfo documentInfo = CreateNewLaserficheDocument(ref item))
                    {
                        AddPagesToLaserficheDocument(documentInfo, ref item);
                        OcrNewDocument(documentInfo, ref item);
                        ExportDocument(documentInfo, item.FQPdfName, ref item);

                        // The repository document is only a staging area for the export.
                        documentInfo.Delete();
                    }
                }

                _session.Save();
                succeeded = true;
            }
            finally
            {
                // Always release the session, not only on the success path.
                try
                {
                    _session.LogOut();
                }
                catch (Exception logOutError)
                {
                    if (succeeded)
                    {
                        throw;
                    }

                    // A failure is already propagating; do not let the log-out error mask it.
                    System.Diagnostics.Trace.TraceWarning("Log out failed after an import error: {0}", logOutError);
                }
            }
        }
        catch (Exception)
        {
            // Note: this replaces any more specific status a helper set before rethrowing.
            item.DocumentStatus = DocumentStatus.Errored;
            throw;
        }
    }

    /// <summary>
    /// Appends one page per file found in <c>item.ImagesDirectory</c> (ordered by file name,
    /// skipping the thumbnail cache file) and streams each file into the page's image part.
    /// </summary>
    /// <param name="documentInfo">The document that receives the pages.</param>
    /// <param name="item">The work item; its status is set to <c>ErrorAddPagesToDoc</c> on failure.</param>
    private void AddPagesToLaserficheDocument(DocumentInfo documentInfo, ref ImportItem item)
    {
        int pageNbr = 0;

        try
        {
            // One buffer is reused for every file instead of allocating per page.
            byte[] buffer = new byte[CopyBufferSize];

            foreach (FileInfo fileInfo in new DirectoryInfo(item.ImagesDirectory)
                .GetFiles()
                .Where(f => !string.Equals(f.Name, ThumbnailCacheFileName, StringComparison.OrdinalIgnoreCase))
                .OrderBy(f => f.Name))
            {
                using (FileStream file = File.OpenRead(fileInfo.FullName))
                {
                    documentInfo.AppendPage();
                    pageNbr++;
                    PageInfo page = documentInfo.GetPageInfo(pageNbr);

                    using (Stream writer = page.WritePagePart(PagePart.Image, (int)file.Length))
                    {
                        int count;
                        while ((count = file.Read(buffer, 0, buffer.Length)) > 0)
                        {
                            writer.Write(buffer, 0, count);
                        }
                    }
                }
            }
        }
        catch (Exception)
        {
            item.DocumentStatus = DocumentStatus.ErrorAddPagesToDoc;
            throw;
        }
    }

    /// <summary>
    /// Creates a new, empty document in <c>item.ImportFolder</c> on <c>item.Volume</c>, named
    /// after <c>item.FileInfo</c> without its extension (auto-renamed on a name clash).
    /// </summary>
    /// <param name="item">The work item; its status is set to <c>ErrorCreateLfDoc</c> on failure.</param>
    /// <returns>The created document. The caller owns it and must dispose it.</returns>
    private DocumentInfo CreateNewLaserficheDocument(ref ImportItem item)
    {
        DocumentInfo documentInfo = null;

        try
        {
            documentInfo = new DocumentInfo(_session);

            // Strips the real extension, whatever its length (".tif", ".tiff", ...).
            string documentName = Path.GetFileNameWithoutExtension(item.FileInfo.Name);

            // The folder handle is only needed for the Create call.
            using (FolderInfo rootFolderInfo = Folder.GetFolderInfo(item.ImportFolder, _session))
            {
                documentInfo.Create(rootFolderInfo, documentName, item.Volume, EntryNameOption.AutoRename);
            }

            return documentInfo;
        }
        catch (Exception)
        {
            item.DocumentStatus = DocumentStatus.ErrorCreateLfDoc;

            // Nobody else holds a reference yet, so release it here.
            if (documentInfo != null)
            {
                documentInfo.Dispose();
            }

            throw;
        }
    }

    /// <summary>
    /// OCRs every page of <paramref name="docInfo"/> that has an image but no text yet,
    /// then indexes the document.
    /// </summary>
    /// <remarks>
    /// OCR is best-effort per page: a failure on one page is traced as a warning and the
    /// remaining pages are still processed.
    /// </remarks>
    /// <param name="docInfo">The document to OCR and index.</param>
    /// <param name="item">The work item; its status is set to <c>ErrorOcrDoc</c> on failure.</param>
    private void OcrNewDocument(DocumentInfo docInfo, ref ImportItem item)
    {
        try
        {
            using (OcrEngine ocrEngine = OcrEngine.LoadEngine())
            {
                ocrEngine.AutoOrient = true;
                ocrEngine.Decolumnize = true;
                ocrEngine.OptimizationMode = OcrOptimizationMode.Accuracy;

                // Page numbers are 1-based.
                for (int i = 1; i <= docInfo.PageCount; i++)
                {
                    PageInfo pi = docInfo.GetPageInfo(i);
                    if (!pi.HasImage || pi.HasText)
                    {
                        continue;
                    }

                    PageSet ps = new PageSet();
                    ps.AddPage(i);

                    try
                    {
                        ocrEngine.Run(docInfo, ps);
                    }
                    catch (Exception ex)
                    {
                        // Keep going, but leave a trace instead of silently dropping the error.
                        System.Diagnostics.Trace.TraceWarning("OCR failed for page {0}: {1}", i, ex);
                    }
                }
            }
        }
        catch (Exception)
        {
            item.DocumentStatus = DocumentStatus.ErrorOcrDoc;
            throw;
        }

        docInfo.Index();
    }

    /// <summary>
    /// Exports all pages of <paramref name="documentInfo"/> to a PDF at
    /// <paramref name="exportLocation"/>, including the page text.
    /// </summary>
    /// <param name="documentInfo">The document to export.</param>
    /// <param name="exportLocation">Destination path of the PDF file.</param>
    /// <param name="item">
    /// The work item; its status is set to <c>CreatedAndExported</c> on success and
    /// <c>ErrorExportDoc</c> on failure.
    /// </param>
    public void ExportDocument(DocumentInfo documentInfo, string exportLocation, ref ImportItem item)
    {
        try
        {
            DocumentExporter documentExporter = new DocumentExporter();
            IDocumentContents documentContents = documentInfo;

            DocumentStatistics statistics = documentInfo.GetStatistics();
            PageSet pageSet = new PageSet("1-" + statistics.PageCount);
            documentExporter.CompressionQuality = 90;

            documentExporter.ExportPdf(documentContents, pageSet, PdfExportOptions.IncludeText, exportLocation);

            item.DocumentStatus = DocumentStatus.CreatedAndExported;
        }
        catch (Exception)
        {
            item.DocumentStatus = DocumentStatus.ErrorExportDoc;
            throw;
        }
    }
}

 

0 0

Answer

SELECTED ANSWER
replied on August 25, 2014

I can reproduce the problem; it is a bug in the PDF export component. I have filed a bug report, and this should be fixed in a future version.

0 0

Replies

replied on August 25, 2014

There doesn't seem to be a PDF attached to this question.

1 0
replied on August 25, 2014

I attached the PDF.

0 0
replied on August 25, 2014

I tried your code on a sample image and the resulting text locations are correct. Can you also attach the source images?

0 0
replied on August 25, 2014

Sorry, I gave you the wrong image.

I will get you the TIFF from Laserfiche.

0 0
replied on August 25, 2014

Here is the source image.

0 0
replied on August 25, 2014

Here you go.

10791553.tiff (145.68 KB)
0 0
replied on August 26, 2014

I am wondering what the ETA will be.

0 0
replied on August 26, 2014

The defect has been identified and resolved and DocumentServices 9.2 will contain the fix. If you need a fix for DocumentServices 9.1 you can contact Laserfiche technical support and request a hotfix, referencing SCR 118579.

0 0
You are not allowed to follow up in this post.

Sign in to reply to this post.