I’ve written a Windows service that performs the following functions:
• Create a Laserfiche document using a multi-page TIFF file
• OCR the newly created document
• Export the document as a PDF.
The issue with the exported PDF is that, when searching for text, the highlight marking a match does not appear where the matching text actually is on the page.
I’ve attached a PDF. When searching for text in file 10791553_SDK.pdf, the text may be found, but the highlight is not positioned accurately over it.
10791553.tif is the original file imported into LF.
Below is the code I’m using to create searchable PDFs with the Laserfiche SDK (version 9.0).
/// <summary>
/// Builds a Laserfiche document from a directory of page images, runs OCR on it,
/// exports it as a PDF with its text included, and then deletes the document
/// from the repository.
/// </summary>
public class Document
{
    // Size of the copy buffer used when streaming an image file into a page (32 KB).
    private const int CopyBufferSize = 0x8000;

    private RepositoryRegistration _repositoryRegistration;
    private Session _session;

    /// <summary>
    /// Logs in to the repository described by <paramref name="item"/> and, when the
    /// item's action is <c>New</c> or <c>Overwrite</c>, creates the document, adds its
    /// pages, OCRs it, exports it to <c>item.FQPdfName</c>, and deletes it again.
    /// </summary>
    /// <param name="item">
    /// The work item. Its <c>DocumentStatus</c> is updated to reflect the outcome.
    /// </param>
    /// <exception cref="Exception">
    /// Any failure is rethrown after <c>item.DocumentStatus</c> has been set.
    /// </exception>
    public void Import(ref ImportItem item)
    {
        bool loggedIn = false;
        try
        {
            _repositoryRegistration = new RepositoryRegistration(item.ServerName, item.RepositoryName);
            _session = new Session();
            _session.LogIn(item.UserName, item.Password, _repositoryRegistration);
            loggedIn = true;

            if (item.DocumentAction == DocumentAction.New || item.DocumentAction == DocumentAction.Overwrite)
            {
                DocumentInfo documentInfo = CreateNewLaserficheDocument(ref item);
                try
                {
                    AddPagesToLaserficheDocument(documentInfo, ref item);
                    OcrNewDocument(documentInfo, ref item);
                    ExportDocument(documentInfo, item.FQPdfName, ref item);

                    // The repository document only exists to produce the PDF, so it is
                    // removed once the export has succeeded.
                    documentInfo.Delete();
                }
                finally
                {
                    // Release the entry even when one of the steps above throws.
                    documentInfo.Dispose();
                }
            }

            _session.Save();
        }
        catch (Exception)
        {
            // NOTE(review): this overwrites the more specific status a helper may have
            // set (ErrorCreateLfDoc, ErrorAddPagesToDoc, ErrorOcrDoc, ErrorExportDoc).
            // Kept as-is because callers may depend on Errored; confirm the intent.
            item.DocumentStatus = DocumentStatus.Errored;

            // "throw;" keeps the original stack trace; "throw ex;" would reset it.
            throw;
        }
        finally
        {
            // Always log out so a failed import does not leave a session open.
            if (loggedIn)
            {
                _session.LogOut();
            }
        }
    }

    /// <summary>
    /// Appends one page to <paramref name="documentInfo"/> for every file in
    /// <c>item.ImagesDirectory</c> (ordered by file name, skipping <c>Thumbs.db</c>)
    /// and streams the file's bytes into that page's image part.
    /// </summary>
    /// <param name="documentInfo">The newly created document that receives the pages.</param>
    /// <param name="item">
    /// The work item; its status is set to <c>ErrorAddPagesToDoc</c> on failure.
    /// </param>
    private void AddPagesToLaserficheDocument(DocumentInfo documentInfo, ref ImportItem item)
    {
        int pageNbr = 0;
        try
        {
            byte[] buffer = new byte[CopyBufferSize];

            foreach (FileInfo fileInfo in new DirectoryInfo(item.ImagesDirectory).GetFiles().OrderBy(x => x.Name))
            {
                // Windows file names are case-insensitive, so match the thumbnail
                // cache regardless of how its name happens to be cased.
                if (string.Equals(fileInfo.Name, "Thumbs.db", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                using (FileStream file = File.OpenRead(fileInfo.FullName))
                {
                    documentInfo.AppendPage();
                    pageNbr++;
                    PageInfo page = documentInfo.GetPageInfo(pageNbr);

                    using (Stream writer = page.WritePagePart(PagePart.Image, (int)file.Length))
                    {
                        int count;
                        while ((count = file.Read(buffer, 0, buffer.Length)) > 0)
                        {
                            writer.Write(buffer, 0, count);
                        }
                    }
                }
            }
        }
        catch (Exception)
        {
            item.DocumentStatus = DocumentStatus.ErrorAddPagesToDoc;
            throw;
        }
    }

    /// <summary>
    /// Creates an empty document in <c>item.ImportFolder</c> on <c>item.Volume</c>,
    /// named after the source file without its extension. The entry is auto-renamed
    /// if the name is already taken.
    /// </summary>
    /// <param name="item">
    /// The work item; its status is set to <c>ErrorCreateLfDoc</c> on failure.
    /// </param>
    /// <returns>The created document. The caller is responsible for disposing it.</returns>
    private DocumentInfo CreateNewLaserficheDocument(ref ImportItem item)
    {
        DocumentInfo documentInfo = null;
        try
        {
            documentInfo = new DocumentInfo(_session);
            FolderInfo rootFolderInfo = Folder.GetFolderInfo(item.ImportFolder, _session);

            // Strip the real extension instead of assuming it is exactly three
            // characters long (".tiff" or an extension-less name would be mangled).
            string documentName = Path.GetFileNameWithoutExtension(item.FileInfo.Name);

            documentInfo.Create(rootFolderInfo, documentName, item.Volume, EntryNameOption.AutoRename);
        }
        catch (Exception)
        {
            // The caller never receives the instance on failure, so release it here.
            if (documentInfo != null)
            {
                documentInfo.Dispose();
            }

            item.DocumentStatus = DocumentStatus.ErrorCreateLfDoc;
            throw;
        }

        return documentInfo;
    }

    /// <summary>
    /// Runs OCR on every page of <paramref name="docInfo"/> that has an image but no
    /// text yet, then indexes the document.
    /// </summary>
    /// <param name="docInfo">The document to OCR.</param>
    /// <param name="item">
    /// The work item; its status is set to <c>ErrorOcrDoc</c> on failure.
    /// </param>
    /// <remarks>
    /// A failure on a single page is traced and skipped so the remaining pages are
    /// still processed. Failures outside the per-page run are rethrown.
    /// </remarks>
    private void OcrNewDocument(DocumentInfo docInfo, ref ImportItem item)
    {
        try
        {
            using (OcrEngine ocrEngine = OcrEngine.LoadEngine())
            {
                ocrEngine.AutoOrient = true;
                ocrEngine.Decolumnize = true;
                ocrEngine.OptimizationMode = OcrOptimizationMode.Accuracy;

                for (int i = 1; i <= docInfo.PageCount; i++)
                {
                    PageInfo pi = docInfo.GetPageInfo(i);
                    if (!pi.HasImage || pi.HasText)
                    {
                        continue;
                    }

                    PageSet ps = new PageSet();
                    ps.AddPage(i);
                    try
                    {
                        ocrEngine.Run(docInfo, ps);
                    }
                    catch (Exception ex)
                    {
                        // Best effort per page: keep going, but leave a trace so a
                        // page that ends up without text can be diagnosed.
                        System.Diagnostics.Trace.TraceWarning("OCR failed for page {0}: {1}", i, ex);
                    }
                }
            }
        }
        catch (Exception)
        {
            item.DocumentStatus = DocumentStatus.ErrorOcrDoc;
            throw;
        }

        docInfo.Index();
    }

    /// <summary>
    /// Exports all pages of <paramref name="documentInfo"/> to a PDF at
    /// <paramref name="exportLocation"/> with the <c>IncludeText</c> option.
    /// </summary>
    /// <param name="documentInfo">The document to export.</param>
    /// <param name="exportLocation">Destination path of the PDF file.</param>
    /// <param name="item">
    /// The work item; its status is set to <c>CreatedAndExported</c> on success and
    /// <c>ErrorExportDoc</c> on failure.
    /// </param>
    public void ExportDocument(DocumentInfo documentInfo, string exportLocation, ref ImportItem item)
    {
        try
        {
            DocumentExporter documentExporter = new DocumentExporter();
            documentExporter.CompressionQuality = 90;

            IDocumentContents documentContents = documentInfo;
            DocumentStatistics statistics = documentInfo.GetStatistics();
            PageSet pageSet = new PageSet("1-" + statistics.PageCount);

            documentExporter.ExportPdf(documentContents, pageSet, PdfExportOptions.IncludeText, exportLocation);

            item.DocumentStatus = DocumentStatus.CreatedAndExported;
        }
        catch (Exception)
        {
            item.DocumentStatus = DocumentStatus.ErrorExportDoc;
            throw;
        }
    }
}