Search string in PDF return page number

masterc

New Member
Joined
Feb 12, 2019
Messages
5


Dear all,





Please excuse me as I am new to this forum however here ismy question.




I have a excel file with strings in column A.

A button on that sheet that is linked to the code below.

What it should do is find each string in a pdf file I selectafter using the button.



I don’t see the pdf file when the ribbon open file menu opens.

I am using excel 2016 and adobe reader dc.

I am no expert in vba,found this code

Sub BatchRenameCS()

Dim objApp As Object
Dim objPDDoc As Object
Dim objjso As Object
Dim newPDF As Acrobat.CAcroPDDoc
Dim lastrow2 As Long
Dim strFileName As String
Dim Folder As String
Dim Page As Long
Dim Cell As Long
Dim PDFCharacterCount() As Long
Dim CharacterCount As Long
Dim i As Integer
Dim c As Integer
Dim x As Integer
Dim strSource As String
Dim strResult As String
Dim PDFCharacters As String
Dim PDFCharacters2 As String
Dim PDFPasteData() As String
Dim PasteDataPage As Integer
Dim LastRow As Long
Dim NewName As String
Dim NewNamePageNum As Integer
Dim Check()
Sheets("Sheet1").Range("C:D").ClearContents
strFileName = selectFile()
Folder = GetFolder()
'create array with pdf word count
Set objApp = CreateObject("AcroExch.App")
Set objPDDoc = CreateObject("AcroExch.PDDoc")
'AD.1 open file, if =false file is damage
If objPDDoc.Open(strFileName) Then
Set objjso = objPDDoc.GetJSObject
ReDim PDFCharacterCount(1 To objPDDoc.GetNumPages) As Long
For Page = 1 To objPDDoc.GetNumPages
PDFCharacters = ""
PDFCharacters2 = ""
For c = 0 To objjso.GetPageNumWords(Page - 1)
PDFCharacters = PDFCharacters & objjso.getPageNthWord(Page - 1, c)
Next c
For i = 1 To Len(PDFCharacters)
Select Case Asc(Mid(PDFCharacters, i, 1))
Case 48 To 57, 65 To 90, 97 To 122:
PDFCharacters2 = PDFCharacters2 & (Mid(PDFCharacters, i, 1))
Case Else
PDFCharacters2 = PDFCharacters2 & ""
End Select
Next
PDFCharacterCount(Page) = Len(PDFCharacters2)
Next Page
lastrow2 = Sheets("Sheet2").Cells(Rows.Count, "A").End(xlUp).Row
Page = 1
ReDim PDFPasteData(1 To objPDDoc.GetNumPages) As String
For Cell = 1 To lastrow2
strResult = ""
strSource = Sheets("Sheet2").Cells(Cell, 1).Text
PDFPasteData(Page) = PDFPasteData(Page) & " " & strSource
For i = 1 To Len(strSource)
Select Case Asc(Mid(strSource, i, 1))
Case 48 To 57, 65 To 90, 97 To 122:
strResult = strResult & (Mid(strSource, i, 1))
Case Else
strResult = strResult & ""
End Select
Next
CharacterCount = CharacterCount + Len(strResult)
If CharacterCount = PDFCharacterCount(Page) Then
CharacterCount = 0
Page = Page + 1
End If
Next Cell
ReDim Check(2, objPDDoc.GetNumPages)
LastRow = Sheets("Sheet1").Cells(Rows.Count, "A").End(xlUp).Row
For Each LookUpCell In Worksheets("Sheet1").Range("A2:A" & LastRow)
For PasteDataPage = 1 To objPDDoc.GetNumPages
If InStr(PDFPasteData(PasteDataPage), LookUpCell.Value) Then
Check(1, PasteDataPage) = Check(1, PasteDataPage) + 1
Check(2, PasteDataPage) = Check(2, PasteDataPage) & LookUpCell.Offset(0, 1).Value & Chr(10)
If FileExist(Folder & "" & LookUpCell.Offset(0, 1) & ".pdf") Then
Set newPDF = CreateObject("AcroExch.pdDoc")
NewName = Folder & "" & LookUpCell.Offset(0, 1) & ".pdf"
newPDF.Open (NewName)
newPDF.InsertPages newPDF.GetNumPages - 1, objPDDoc, PasteDataPage - 1, 1, 0
newPDF.Save 1, NewName
newPDF.Close
Set newPDF = Nothing
Else
Set newPDF = CreateObject("AcroExch.PDDoc")
newPDF.Create
NewName = Folder & "" & LookUpCell.Offset(0, 1) & ".pdf"
newPDF.InsertPages -1, objPDDoc, PasteDataPage - 1, 1, 0
newPDF.Save 1, NewName
newPDF.Close
Set newPDF = Nothing
End If
End If
Next PasteDataPage
Next LookUpCell
x = 1
For PasteDataPage = 1 To objPDDoc.GetNumPages
If Check(1, PasteDataPage) <> 1 Then
Sheets("Sheet1").Cells(x, 3) = PasteDataPage
Sheets("Sheet1").Cells(x, 4) = Check(2, PasteDataPage)
x = x + 1
End If
Next PasteDataPage
End If
MsgBox "Done"
End Sub
Function FileExist(path As String) As Boolean
If Dir(path) <> vbNullString Then FileExist = True
End Function
Function selectFile()
Dim fd As FileDialog, fileName As String
On Error GoTo ErrorHandler
Set fd = Application.FileDialog(msoFileDialogFilePicker)
fd.AllowMultiSelect = False
If fd.Show = True Then
If fd.SelectedItems(1) <> vbNullString Then
fileName = fd.SelectedItems(1)
End If
Else
'Exit code if no file is selected
End
End If
'Return Selected FileName
selectFile = fileName
Set fd = Nothing
Exit Function
ErrorHandler:
Set fd = Nothing
MsgBox "Error " & Err & ": " & Error(Err)
End Function
Function GetFolder() As String
Dim fldr As FileDialog
Dim sItem As String
Set fldr = Application.FileDialog(msoFileDialogFolderPicker)
With fldr
.Title = "Select the Folder where you want you new PDFs to go"
.AllowMultiSelect = False
.InitialFileName = Application.DefaultFilePath
If .Show <> -1 Then GoTo NextCode
sItem = .SelectedItems(1)
End With
NextCode:
GetFolder = sItem
Set fldr = Nothing
End Function

 
Would you be able to help me narrow it down so that it only returns exact matches?
Untested - replace this code:
VBA Code:
                'Extract all page content into a text string
                
                PageText = ""
                For i = 0 To TextSelect.GetNumText - 1
                    PageText = PageText & TextSelect.GetText(i)
                Next
                
                'Search for each cell string in this page
                
                For Each searchString In searchStringCells
                    foundPageNumbers = searchString.Offset(, 1).Value
                    If InStr(1, PageText, searchString.Value, vbTextCompare) > 0 Then
                        If foundPageNumbers = "" Then
                            foundPageNumbers = p + 1
                        Else
                            foundPageNumbers = foundPageNumbers & ", " & p + 1
                        End If
                        searchString.Offset(, 1).Value = foundPageNumbers
                    End If
                Next
with:
VBA Code:
                'Extract all page content into a text string with each word delimited by '|'
                
                PageText = "|"
                For i = 0 To TextSelect.GetNumText - 1
                    PageText = PageText & Trim(TextSelect.GetText(i)) & "|"
                Next
                
                'Search for each cell string in this page
                
                For Each searchString In searchStringCells
                    foundPageNumbers = searchString.Offset(, 1).Value
                    If InStr(1, PageText, "|" & searchString.Value & "|", vbTextCompare) > 0 Then
                        If foundPageNumbers = "" Then
                            foundPageNumbers = p + 1
                        Else
                            foundPageNumbers = foundPageNumbers & ", " & p + 1
                        End If
                        searchString.Offset(, 1).Value = foundPageNumbers
                    End If
                Next
 
Upvote 0

Excel Facts

Get help while writing formula
Click the italics "fx" icon to the left of the formula bar to open the Functions Arguments dialog. Help is displayed for each argument.
@John_w Unfortunately didn't work and instead many strings failed to return numbers. That may have something to do with the real formatting of my strings. All of them are in the format: A3.11.1, A3.11.2(a) etc if you can think of a reason the additional trim code would have cut them out?
 
Upvote 0
The new code isn't just trimming (removing spaces from the start and end of) the 'words'; it is surrounding each extracted word with '|' characters and then searching for your strings, surrounded by '|' characters, thereby looking for the exact string.

I'm unable to test the code at the moment, but I would debug the code and examine the values of TextSelect.GetText(i)) and see if there are any odd characters in the string, including linefeed (vbLf) and carriage return (vbCr) characters.
 
Upvote 0
@John_w - Thanks for all the help, it's really appreciated. Shall take a look with a bit of free time tomorrow and report back.
 
Upvote 0
@John_w - Very new to VBA so apologies if I get this wrong but having tried to debut this morning I think the issue could actually be what we extract to searchStringCells. Taking a single example, if "5.3.1" is in cell A2 then that is extracted and when that searchString is searched on the contents of each page it will accurately identify that "5.3.1" exists in cases of "5.3.1" but also for "5.3.1(a)", "5.3.1(b)" etc and delimiting them won't prevent that.
 
Upvote 0
Taking a single example, if "5.3.1" is in cell A2 then that is extracted and when that searchString is searched on the contents of each page it will accurately identify that "5.3.1" exists in cases of "5.3.1" but also for "5.3.1(a)", "5.3.1(b)" etc and delimiting them won't prevent that.
Delimiting both the string being searched for and every extracted 'word' in the PageText string does prevent that.

For example, if PageText is "|rtqeq|etyuer|5.3.1|qfhge|tirerf|5.3.1(a)|dfdgsd|qery|ryruty|5.3.1(b)|ewrwe|" and you looking for "5.3.1", the InStr is actually looking for "|5.3.1|" and there is only one occurrence of that.

I have found a possible fix. In my testing some words end with CR LF characters, which means that the InStr doesn't find them. To ignore these characters, replace:

VBA Code:
                    PageText = PageText & Trim(TextSelect.GetText(i)) & "|"
with:
VBA Code:
                    PageText = PageText & Replace(Trim(TextSelect.GetText(i)), vbCrLf, "") & "|"
 
Upvote 0
@John_w - That has fixed a number of the issues. There is one miss "A3.3.1" that isn't returning a page number for reasons I don't understand and the revised code seems to mean that any string ending in a bracket (e.g. "A12.4.7(a)") is no longer recognised. Onwards!
 
Upvote 0
If you want me to investigate, please provide a sample PDF containing the "A3.3.1" - upload to a file sharing site and post the link here.
 
Upvote 0
If you want me to investigate, please provide a sample PDF containing the "A3.3.1" - upload to a file sharing site and post the link here.
That would be so helpful, thank you. I'll create one now with anonymised data and upload over the weekend.
 
Upvote 0

Forum statistics

Threads
1,224,823
Messages
6,181,182
Members
453,020
Latest member
Mohamed Magdi Tawfiq Emam

We've detected that you are using an adblocker.

We have a great community of people providing Excel help here, but the hosting costs are enormous. You can help keep this site running by allowing ads on MrExcel.com.
Allow Ads at MrExcel

Which adblocker are you using?

Disable AdBlock

Follow these easy steps to disable AdBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the icon in the browser’s toolbar.
2)Click on the "Pause on this site" option.
Go back

Disable AdBlock Plus

Follow these easy steps to disable AdBlock Plus

1)Click on the icon in the browser’s toolbar.
2)Click on the toggle to disable it for "mrexcel.com".
Go back

Disable uBlock Origin

Follow these easy steps to disable uBlock Origin

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back

Disable uBlock

Follow these easy steps to disable uBlock

1)Click on the icon in the browser’s toolbar.
2)Click on the "Power" button.
3)Click on the "Refresh" button.
Go back
Back
Top