Search string in PDF return page number

masterc

New Member
Joined
Feb 12, 2019
Messages
5


Dear all,





Please excuse me as I am new to this forum however here ismy question.




I have a excel file with strings in column A.

A button on that sheet that is linked to the code below.

What it should do is find each string in a pdf file I selectafter using the button.



I don’t see the pdf file when the ribbon open file menu opens.

I am using excel 2016 and adobe reader dc.

I am no expert in vba,found this code

Sub BatchRenameCS()

Dim objApp As Object
Dim objPDDoc As Object
Dim objjso As Object
Dim newPDF As Acrobat.CAcroPDDoc
Dim lastrow2 As Long
Dim strFileName As String
Dim Folder As String
Dim Page As Long
Dim Cell As Long
Dim PDFCharacterCount() As Long
Dim CharacterCount As Long
Dim i As Integer
Dim c As Integer
Dim x As Integer
Dim strSource As String
Dim strResult As String
Dim PDFCharacters As String
Dim PDFCharacters2 As String
Dim PDFPasteData() As String
Dim PasteDataPage As Integer
Dim LastRow As Long
Dim NewName As String
Dim NewNamePageNum As Integer
Dim Check()
Sheets("Sheet1").Range("C:D").ClearContents
strFileName = selectFile()
Folder = GetFolder()
'create array with pdf word count
Set objApp = CreateObject("AcroExch.App")
Set objPDDoc = CreateObject("AcroExch.PDDoc")
'AD.1 open file, if =false file is damage
If objPDDoc.Open(strFileName) Then
Set objjso = objPDDoc.GetJSObject
ReDim PDFCharacterCount(1 To objPDDoc.GetNumPages) As Long
For Page = 1 To objPDDoc.GetNumPages
PDFCharacters = ""
PDFCharacters2 = ""
For c = 0 To objjso.GetPageNumWords(Page - 1)
PDFCharacters = PDFCharacters & objjso.getPageNthWord(Page - 1, c)
Next c
For i = 1 To Len(PDFCharacters)
Select Case Asc(Mid(PDFCharacters, i, 1))
Case 48 To 57, 65 To 90, 97 To 122:
PDFCharacters2 = PDFCharacters2 & (Mid(PDFCharacters, i, 1))
Case Else
PDFCharacters2 = PDFCharacters2 & ""
End Select
Next
PDFCharacterCount(Page) = Len(PDFCharacters2)
Next Page
lastrow2 = Sheets("Sheet2").Cells(Rows.Count, "A").End(xlUp).Row
Page = 1
ReDim PDFPasteData(1 To objPDDoc.GetNumPages) As String
For Cell = 1 To lastrow2
strResult = ""
strSource = Sheets("Sheet2").Cells(Cell, 1).Text
PDFPasteData(Page) = PDFPasteData(Page) & " " & strSource
For i = 1 To Len(strSource)
Select Case Asc(Mid(strSource, i, 1))
Case 48 To 57, 65 To 90, 97 To 122:
strResult = strResult & (Mid(strSource, i, 1))
Case Else
strResult = strResult & ""
End Select
Next
CharacterCount = CharacterCount + Len(strResult)
If CharacterCount = PDFCharacterCount(Page) Then
CharacterCount = 0
Page = Page + 1
End If
Next Cell
ReDim Check(2, objPDDoc.GetNumPages)
LastRow = Sheets("Sheet1").Cells(Rows.Count, "A").End(xlUp).Row
For Each LookUpCell In Worksheets("Sheet1").Range("A2:A" & LastRow)
For PasteDataPage = 1 To objPDDoc.GetNumPages
If InStr(PDFPasteData(PasteDataPage), LookUpCell.Value) Then
Check(1, PasteDataPage) = Check(1, PasteDataPage) + 1
Check(2, PasteDataPage) = Check(2, PasteDataPage) & LookUpCell.Offset(0, 1).Value & Chr(10)
If FileExist(Folder & "" & LookUpCell.Offset(0, 1) & ".pdf") Then
Set newPDF = CreateObject("AcroExch.pdDoc")
NewName = Folder & "" & LookUpCell.Offset(0, 1) & ".pdf"
newPDF.Open (NewName)
newPDF.InsertPages newPDF.GetNumPages - 1, objPDDoc, PasteDataPage - 1, 1, 0
newPDF.Save 1, NewName
newPDF.Close
Set newPDF = Nothing
Else
Set newPDF = CreateObject("AcroExch.PDDoc")
newPDF.Create
NewName = Folder & "" & LookUpCell.Offset(0, 1) & ".pdf"
newPDF.InsertPages -1, objPDDoc, PasteDataPage - 1, 1, 0
newPDF.Save 1, NewName
newPDF.Close
Set newPDF = Nothing
End If
End If
Next PasteDataPage
Next LookUpCell
x = 1
For PasteDataPage = 1 To objPDDoc.GetNumPages
If Check(1, PasteDataPage) <> 1 Then
Sheets("Sheet1").Cells(x, 3) = PasteDataPage
Sheets("Sheet1").Cells(x, 4) = Check(2, PasteDataPage)
x = x + 1
End If
Next PasteDataPage
End If
MsgBox "Done"
End Sub
Function FileExist(path As String) As Boolean
If Dir(path) <> vbNullString Then FileExist = True
End Function
Function selectFile()
Dim fd As FileDialog, fileName As String
On Error GoTo ErrorHandler
Set fd = Application.FileDialog(msoFileDialogFilePicker)
fd.AllowMultiSelect = False
If fd.Show = True Then
If fd.SelectedItems(1) <> vbNullString Then
fileName = fd.SelectedItems(1)
End If
Else
'Exit code if no file is selected
End
End If
'Return Selected FileName
selectFile = fileName
Set fd = Nothing
Exit Function
ErrorHandler:
Set fd = Nothing
MsgBox "Error " & Err & ": " & Error(Err)
End Function
Function GetFolder() As String
Dim fldr As FileDialog
Dim sItem As String
Set fldr = Application.FileDialog(msoFileDialogFolderPicker)
With fldr
.Title = "Select the Folder where you want you new PDFs to go"
.AllowMultiSelect = False
.InitialFileName = Application.DefaultFilePath
If .Show <> -1 Then GoTo NextCode
sItem = .SelectedItems(1)
End With
NextCode:
GetFolder = sItem
Set fldr = Nothing
End Function

 
Thanks for the workbook and PDF. I've found the issue - it's because 'words' like "A3.1.3(a)" are seen by the Acrobat GetText function as 2 separate words, "A3.1.3(" and "a)CRLF". You can see this on the following debug output:

756 >match <
757 >the <
758 >new <
759 >theme.CRLF<
760 >A3.1.1CRLF<
761 >A3.1.2CRLF<
762 >A3.1.3(<
763 >a)CRLF<
764 >A3.1.3(<
765 >b)CRLF<
766 >A3.1.3(<

which is produced by the following code changes, looking at only page 65:

VBA Code:
    For p = 64 To 64 '0 To AcroPDDoc.GetNumPages - 1


                PageText = " "
                For i = 0 To TextSelect.GetNumText - 1
                    Debug.Print i; ">" & Replace(TextSelect.GetText(i), vbCrLf, "CRLF") & "<"
                    If InStr(TextSelect.GetText(i), "A3.1") Then Stop
                    PageText = PageText & Replace(TextSelect.GetText(i), vbCrLf, " ")
                Next
                PageText = PageText & " "
                Debug.Print PageText
                Stop
756, 757 etc. is the index number of each word on the page. Notice that every word ends with either a space or CRLF characters (which indicates end of paragraph). Therefore if we replace every CRLF with a space then the separate words like "A3.1.3(" and "a)CRLF" will be joined together as "A3.1.3(a) " in the PageText string and we can search for " A3.1.3(a) " (delimited by a space) to find it.

Here is the complete code, which also formats column B as Text at the end.

VBA Code:
Public Sub Search_Strings_in_PDF_Pages()

    Dim AcroApp As CAcroApp
    Dim AcroPDDoc As CAcroPDDoc
    Dim AcroHiliteList As CAcroHiliteList
    Dim TextSelect As CAcroPDTextSelect
    Dim Page As CAcroPDPage
    Dim PageContent As CAcroHiliteList
    Dim PDFfile As String
    Dim searchStringCells As Range, searchString As Range
    Dim p As Long, i As Long
    Dim PageText As String
    Dim foundPageNumbers As String
    
    'Search selected cells
    'Set searchStringCells = Selection
    
    'Or column A cells
    With ActiveSheet
        Set searchStringCells = .Range("A2", .Cells(.Rows.Count, "A").End(xlUp))
    End With
    
    PDFfile = Application.GetOpenFilename("PDF Files (*.pdf), *.pdf", Title:="Select the PDF file to search")
    If PDFfile = "False" Then Exit Sub
        
    Set AcroApp = CreateObject("AcroExch.App")
    Set AcroPDDoc = CreateObject("AcroExch.PDDoc")
    
    'Open the PDF
    
    If Not AcroPDDoc.Open(PDFfile) Then
        MsgBox PDFfile & " couldn't be opened - macro exiting"
        Exit Sub
    End If
    
    searchStringCells.Offset(, 1).Cells.Clear
    
    'Read each page
    
    For p = 0 To AcroPDDoc.GetNumPages - 1
    
        Set Page = AcroPDDoc.AcquirePage(p)
        Set PageContent = CreateObject("AcroExch.HiliteList")
    
        'Get up to 9,999 words from this page
        
        If PageContent.Add(0, 9999) Then
           
            Set TextSelect = Page.CreatePageHilite(PageContent)
       
            If Not TextSelect Is Nothing Then
            
                'Extract all page content into a text string with each word delimited by ' '
                
                PageText = " "
                For i = 0 To TextSelect.GetNumText - 1
                    'Debug.Print i; ">" & Replace(TextSelect.GetText(i), vbCrLf, "CRLF") & "<"
                    'If InStr(TextSelect.GetText(i), "A3.1") Then Stop
                    PageText = PageText & Replace(TextSelect.GetText(i), vbCrLf, " ")
                Next
                PageText = PageText & " "
                'Debug.Print PageText
                'Stop
                
                'Search for each cell string in this page
                
                For Each searchString In searchStringCells
                    foundPageNumbers = searchString.Offset(, 1).Value
                    If InStr(1, PageText, " " & searchString.Value & " ", vbTextCompare) > 0 Then
                        If foundPageNumbers = "" Then
                            foundPageNumbers = p + 1
                        Else
                            foundPageNumbers = foundPageNumbers & ", " & p + 1
                        End If
                        searchString.Offset(, 1).Value = foundPageNumbers
                    End If
                Next
                
            End If
            
        Else
    
            MsgBox "PageContent.Add error on page " & p + 1 & " - page not searched"
   
        End If
    
    Next
    
    AcroPDDoc.Close
    AcroApp.Exit

    With ActiveSheet
        .Columns("B:B").NumberFormat = "@" 'Text format
    End With

    MsgBox "Finished"
    
End Sub
Result of first few rows:
Test Checklister.xlsm
AB
1Annex RefPage Numbers
2A3.1.165
3A3.1.265
4A3.1.3(a)65
5A3.1.3(b)4, 11, 65
6A3.1.3(c)65
7A3.1.3(d)65
8A3.1.437, 65
9A3.1.565
10A3.2.14, 11, 65
11A3.3.114, 65
12A3.4.165
13A3.4.265
14A3.5.165
15A3.5.1(a)37, 62, 65
16A3.5.1(b)65
17A3.5.1(b)(i)65
18A3.5.1(b)(ii)65
19A3.5.1(b)(iii)65
20A3.5.265
Sheet1
 
Upvote 0

Excel Facts

Pivot Table Drill Down
Double-click any number in a pivot table to create a new report showing all detail rows that make up that number
Thank you so much! Will take a look. Using it on the actual PDF I need to there are still a few missing ones but think I now need to take it away and debug to see if I can solve the final pieces given everything you've already provided.
 
Upvote 0
@John_w - Thanks again for all of your help last week. The random misses are perplexing as the script will pick up 16 instances of A3.5.1(a) but miss a single one and misses A3.3.1 in the one area in which it appears on pg 14 of the real document. Would you be able to help me add a block that will print the output of the page inspections from specified pages so that I can see what it is viewing those strings to be? I presume that is why they are not being picked up. Thanks again, I owe you a paypal beer (at least!)
 
Upvote 0
the script will pick up 16 instances of A3.5.1(a) but miss a single one and misses A3.3.1 in the one area in which it appears on pg 14 of the real document. Would you be able to help me add a block that will print the output of the page inspections from specified pages so that I can see what it is viewing those strings to be?
You could use similar debugging code that I used in post #21, but with For p = 13 To 13 for page 14.
 
Upvote 0
Have tried and in my stunning incompetance am somehow outputting the last page to the immediate window! Will try again!
 
Upvote 0

Forum statistics

Threads
1,224,827
Messages
6,181,198
Members
453,022
Latest member
RobertV1609

We've detected that you are using an adblocker.

We have a great community of people providing Excel help here, but the hosting costs are enormous. You can help keep this site running by allowing ads on MrExcel.com.
Allow Ads at MrExcel

Which adblocker are you using?

Disable AdBlock

Follow these easy steps to disable AdBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the icon in the browser’s toolbar.
2)Click on the "Pause on this site" option.
Go back

Disable AdBlock Plus

Follow these easy steps to disable AdBlock Plus

1)Click on the icon in the browser’s toolbar.
2)Click on the toggle to disable it for "mrexcel.com".
Go back

Disable uBlock Origin

Follow these easy steps to disable uBlock Origin

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back

Disable uBlock

Follow these easy steps to disable uBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back
Back
Top