Hello everyone,
I am trying to use script below to do OCR, but it does not work. I found problem in part "Shell". Simply, it does not convert PDF to JPG.
Does anyone know how to fix?
Thank you !
P.S. Programs tesseract and magick are installed and manualy working fine.
I am trying to use script below to do OCR, but it does not work. I found problem in part "Shell". Simply, it does not convert PDF to JPG.
Does anyone know how to fix?
VBA Code:
Option Explicit
#If Win64 Then 'depending on 64 or 32 bit windows
Public Declare PtrSafe Sub Sleep Lib "kernel32" ( _
ByVal dwMilliseconds As Long)
Public Declare PtrSafe Function APIMsgBox Lib "user32" Alias "MessageBoxA" _
(Optional ByVal hWnd As Long, _
Optional ByVal prompt As String, _
Optional ByVal title As String, _
Optional ByVal buttons As Long) _
As Long
Public Declare PtrSafe Function FindWindow Lib "user32" Alias "FindWindowA" ( _
ByVal lpClassName As String, _
ByVal lpWindowName As String) As LongPtr
Public Declare PtrSafe Function GetDesktopWindow Lib "user32" () _
As Long
Public Declare PtrSafe Function FindWindowEx Lib "user32" Alias "FindWindowExA" _
(ByVal hWnd1 As Long, _
ByVal hWnd2 As Long, _
ByVal lpsz1 As String, _
ByVal lpsz2 As String) _
As Long
#Else
Public Declare Sub Sleep Lib "kernel32" ( _
ByVal dwMilliseconds As Long)
Public Declare Function APIMsgBox Lib "user32" Alias "MessageBoxA" _
(Optional ByVal hWnd As Long, _
Optional ByVal prompt As String, _
Optional ByVal title As String, _
Optional ByVal buttons As Long) _
As Long
Public Declare Function FindWindow Lib "user32" Alias "FindWindowA" ( _
ByVal lpClassName As String, _
ByVal lpWindowName As String) As LongPtr
Declare Function GetDesktopWindow Lib "user32" () _
As Long
Private Declare Function FindWindowEx Lib "user32" Alias "FindWindowExA" _
(ByVal hWnd1 As Long, _
ByVal hWnd2 As Long, _
ByVal lpsz1 As String, _
ByVal lpsz2 As String) _
As Long
#End If
'Tools:
'- Magick
'- Tesseract
'References:
'- Visual Basic for Applications
'- Microsoft Excel 16.0 Object Library
'- OLE Automation
'- Microsoft Office 16.0 Office Library
'- Microsoft Scripting Runtime
'Sub procedures:
'PDF_to_Txt_Part_01_Loop_through_files
'PDF_to_Txt_Part_02_Convert_pdf_to_jpg
'PDF_to_Txt_Part_03_Read_text_from_jpg
'PDF_to_Txt_Part_04_Wait_for_Magick_and_Tesseract_to_finish
'PDF_to_Txt_Part_05_Combining_all_text_files_into_1
'NOTE each sub works on its own. There are 5 parts for educational purpose.
'If you just want to final result then go to part 05
'Last successfull testrun of all subs on 17.01.2021
Sub PDF_to_Txt_Part_01_Loop_through_files()
Dim FSO As Scripting.FileSystemObject
Dim oFolderPDF As Scripting.Folder
Dim oFile As Scripting.File
Dim sFolderPDF As String
Set FSO = CreateObject("Scripting.FileSystemObject")
sFolderPDF = "C:\Users\logistics.rs\Desktop\PDFs"
Set oFolderPDF = FSO.GetFolder(sFolderPDF)
For Each oFile In oFolderPDF.Files
Debug.Print oFile.Path
Next oFile
Debug.Print "The end"
End Sub
Sub PDF_to_Txt_Part_02_Convert_pdf_to_jpg()
Dim FSO As Scripting.FileSystemObject
Dim oFolderPDF As Scripting.Folder
Dim oFile As Scripting.File
Dim sFolderPDF As String
Dim sMagick As String: sMagick = """" & "C:\Program Files\ImageMagick-7.0.11-Q16-HDRI\magick.exe" & """"
Set FSO = CreateObject("Scripting.FileSystemObject")
sFolderPDF = "C:\Users\logistics.rs\Desktop\PDFs\"
Set oFolderPDF = FSO.GetFolder(sFolderPDF)
For Each oFile In oFolderPDF.Files
'Debug.Print oFile.Path
'Debug.Print sMagick & " " & """" & oFile.Path & """" & " " & """" & Left(oFile.Path, Len(oFile.Path) - 3) & "jpg" & """"
Call Shell(sMagick & " " & """" & oFile.Path & """" & " " & """" & Left(oFile.Path, Len(oFile.Path) - 3) & "jpg" & """", vbNormalFocus) 'Run Magick: PDF to JPG
RetVal = Shell("cmd.exe /c convert """ & sLocalFile & """ """ & sLocalFile & ".jpg""", vbMaximizedFocus)
Next oFile
Debug.Print "The end"
End Sub
Sub PDF_to_Txt_Part_03_Read_text_from_jpg()
Dim FSO As Scripting.FileSystemObject
Dim oFolderPDF As Scripting.Folder
Dim oFile As Scripting.File
Dim sFolderPDF As String
Dim sMagick As String: sMagick = """" & "C:\Program Files\ImageMagick-7.0.11-Q16-HDRI\magick.exe" & """"
Dim sTesseract As String: sTesseract = """" & "C:\Program Files\Tesseract-OCR\tesseract.exe" & """"
Set FSO = CreateObject("Scripting.FileSystemObject")
sFolderPDF = "C:\Users\logistics.rs\Desktop\PDFs"
Set oFolderPDF = FSO.GetFolder(sFolderPDF)
'Convert from PDF to JPG
For Each oFile In oFolderPDF.Files
'Debug.Print oFile.Path
Debug.Print sMagick & " """ & oFile.Path & """ """ & Left(oFile.Path, Len(oFile.Path) - 3) & "jpg" & """"
Call Shell(sMagick & " """ & oFile.Path & """ """ & Left(oFile.Path, Len(oFile.Path) - 3) & "jpg" & """", vbNormalFocus) 'Run Magick: PDF to JPG
Next oFile
'Something needs to be added to wait before the conversion to finish
'Get text from JPG
For Each oFile In oFolderPDF.Files
Debug.Print oFile.Type
If oFile.Type = "JPG File" Then
Debug.Print oFile.Name
Debug.Print sTesseract & " """ & oFile.Path & """ """ & Left(oFile.Path, Len(oFile.Path) - 4) & """"
Call Shell(sTesseract & " """ & oFile.Path & """ """ & Left(oFile.Path, Len(oFile.Path) - 4) & """", vbNormalFocus)
End If
Next oFile
Debug.Print "The end"
End Sub
Sub PDF_to_Txt_Part_04_Wait_for_Magick_and_Tesseract_to_finish()
Dim FSO As Scripting.FileSystemObject
Dim oFolderPDF As Scripting.Folder
Dim oFile As Scripting.File
Dim sFolderPDF As String
Dim sMagickQuoted As String: sMagickQuoted = """" & "C:\Program Files\ImageMagick-7.0.11-Q16-HDRI\magick.exe" & """"
Dim sMagick As String: sMagick = "C:\Program Files\ImageMagick-7.0.11-Q16-HDRI\magick.exe"
Dim sTesseractQuoted As String: sTesseractQuoted = """" & "C:\Program Files\Tesseract-OCR\tesseract.exe" & """"
Dim sTesseract As String: sTesseract = "C:\Program Files\Tesseract-OCR\tesseract.exe"
Set FSO = CreateObject("Scripting.FileSystemObject")
sFolderPDF = "C:\Users\logistics.rs\Desktop\PDFs\"
Set oFolderPDF = FSO.GetFolder(sFolderPDF)
'Convert from PDF to JPG
For Each oFile In oFolderPDF.Files
'Debug.Print oFile.Path
Debug.Print sMagickQuoted & " """ & oFile.Path & """ """ & Left(oFile.Path, Len(oFile.Path) - 3) & "jpg" & """"
Call Shell(sMagickQuoted & " """ & oFile.Path & """ """ & Left(oFile.Path, Len(oFile.Path) - 3) & "jpg" & """", vbNormalFocus) 'Run Magick: PDF to JPG
Next oFile
If app_finished(sMagick) Then
Debug.Print "OK"
Else
Debug.Print "KO"
End If
'Get text from JPG
For Each oFile In oFolderPDF.Files
Debug.Print oFile.Type
If oFile.Type = "JPG File" Then
Debug.Print oFile.Name
Debug.Print sTesseractQuoted & " """ & oFile.Path & """ """ & Left(oFile.Path, Len(oFile.Path) - 4) & """"
Call Shell(sTesseractQuoted & " """ & oFile.Path & """ """ & Left(oFile.Path, Len(oFile.Path) - 4) & """", vbNormalFocus)
End If
Next oFile
If app_finished(sTesseract) Then
Debug.Print "OK"
Else
Debug.Print "KO"
End If
Debug.Print "The end"
End Sub
Function app_finished(sName As String) As Boolean
Dim sAPI As String
Dim i As Integer
sAPI = FindWindow(vbNullString, sName)
i = 0
Do Until sAPI <> "0" 'Catch the screen
Sleep 50
sAPI = FindWindow(vbNullString, sName)
i = i + 1
Loop
If i >= 50 Then
Exit Function
End If
i = 0
Do Until sAPI = "0" 'loop until the screen is away
Sleep 500
sAPI = FindWindow(vbNullString, sName)
i = i + 1
Loop
If i >= 50 Then
Exit Function
End If
app_finished = True
End Function
Sub PDF_to_Txt_Part_05_Combining_all_text_files_into_1()
Dim FSO As Scripting.FileSystemObject
Dim oFolderPDF As Scripting.Folder
Dim oFile As Scripting.File
Dim oOutput As Scripting.TextStream
Dim oInput As Scripting.TextStream
Dim sFolderPDF As String
Dim sMagickQuoted As String: sMagickQuoted = """" & "C:\Program Files\ImageMagick-7.0.11-Q16-HDRI\magick.exe" & """"
Dim sMagick As String: sMagick = "C:\Program Files\ImageMagick-7.0.11-Q16-HDRI\magick.exe"
Dim sTesseractQuoted As String: sTesseractQuoted = """" & "C:\Program Files\Tesseract-OCR\tesseract.exe" & """"
Dim sTesseract As String: sTesseract = "C:\Program Files\Tesseract-OCR\tesseract.exe"
Dim ws As Worksheet: Set ws = ThisWorkbook.Worksheets("PDFs")
Dim iRow As Integer: iRow = 1
Dim sPDFExcel As String
Dim sPDFFolder As String
Dim iPageCounter As Integer: iPageCounter = 1
Set FSO = CreateObject("Scripting.FileSystemObject")
sFolderPDF = "C:\Users\logistics.rs\Desktop\PDFs\"
Set oFolderPDF = FSO.GetFolder(sFolderPDF)
'Convert from PDF to JPG
For Each oFile In oFolderPDF.Files
'Debug.Print oFile.Path
Debug.Print sMagickQuoted & " """ & oFile.Path & """ """ & Left(oFile.Path, Len(oFile.Path) - 3) & "jpg" & """"
If oFile.Type = "PDF File" Then
Call Shell(sMagickQuoted & " """ & oFile.Path & """ """ & Left(oFile.Path, Len(oFile.Path) - 3) & "jpg" & """", vbNormalFocus) 'Run Magick: PDF to JPG
ws.Cells(iRow, 1) = oFile.Name
iRow = iRow + 1
End If
Next oFile
If app_finished(sMagick) Then
Debug.Print "OK"
Else
Debug.Print "KO"
End If
'Get text from JPG
For Each oFile In oFolderPDF.Files
Debug.Print oFile.Type
If oFile.Type = "JPG File" Then
Debug.Print oFile.Name
Debug.Print sTesseractQuoted & " """ & oFile.Path & """ """ & Left(oFile.Path, Len(oFile.Path) - 4) & """"
Call Shell(sTesseractQuoted & " """ & oFile.Path & """ """ & Left(oFile.Path, Len(oFile.Path) - 4) & """", vbNormalFocus)
End If
Next oFile
If app_finished(sTesseract) Then
Debug.Print "OK"
Else
Debug.Print "KO"
End If
'Put all the txt files into 1
iRow = 1
Do Until ws.Cells(iRow, 1) = vbNullString
For Each oFile In oFolderPDF.Files
Debug.Print oFile.Name
sPDFExcel = Left(ws.Cells(iRow, 1), Len(ws.Cells(iRow, 1)) - 4)
sPDFFolder = Left(oFile.Name, Len(ws.Cells(iRow, 1)) - 4)
Debug.Print sPDFExcel
Debug.Print sPDFFolder
If sPDFExcel = sPDFFolder Then
If oFile.Type <> "PDF File" Then
If oFile.Type = "Text Document" Then
Set oInput = FSO.OpenTextFile(oFile.Path, ForReading)
If oOutput Is Nothing Then
Set oOutput = FSO.CreateTextFile(oFile.ParentFolder & "/" & sPDFExcel & "_FULL.txt", True)
End If
oOutput.WriteLine (oInput.ReadAll)
oOutput.WriteLine (" _-_-_-_-_-_-_-_-_- Page " & iPageCounter & " _-_-_-_-_-_-_-_-_- ")
iPageCounter = iPageCounter + 1
oInput.Close
End If
oFile.Delete
End If
End If
Next oFile
iRow = iRow + 1
iPageCounter = 1
oOutput.Close
Set oOutput = Nothing
Loop
Debug.Print "The end"
End Sub
Thank you !
P.S. Programs tesseract and magick are installed and manualy working fine.