M
Mark
Hello
I was hoping to get some tips on how to get what is in between
<Title>....</Title> tags in a text file containing HTML source code
into column 3. The VBA code searches through many folders and
subfolders finding any file that is .html and reading it. I tried
parsing but can't seem to get it to work. Any ideas? Here is the code
that I have (that doesn't work)
Sub CheckTextFilesForHREFs()
MsgBox "Press OK to begin report"
Dim WholeLine As String
Dim myPath As String
Dim workfile As String
Dim myR As Long
myPath = "C:\Exelon\"
workfile = Dir(myPath & "*.html")
'sLine = WholeLine
Set fs = Application.FileSearch
With fs
..LookIn = "C:\Exelon"
..Filename = ".html"
..SearchSubFolders = True
'.FileType = mosFileTypeAllFiles
If .Execute(SortBy:=msoSortByFileName, _
SortOrder:=msoSortOrderAscending) > 0 Then
MsgBox "There were " & .FoundFiles.Count & _
" file(s) found."
For i = 1 To .FoundFiles.Count
ParseURL .FoundFiles(i)
ParseTitle .FoundFiles(i)
ParseLink .FoundFiles(i)
Next i
Else
MsgBox "There were no files found."
End If
End With
Sub ParseTitle(strFile As String)
Dim strTxt As String, lngTxt As Long, i As Long, oMatches
Dim ws As Worksheet, j As Long, k As Long, m As Long, oMatches2
Dim reg, oMatches3, reg2
i = FreeFile
'strFile = "c:\Users\Richard\Documents\Htmltest.html"
lngTxt = FileLen(strFile)
strTxt = Space(lngTxt)
Open strFile For Binary Access Read As #i
Get #i, , strTxt
Close #i
Debug.Print strTxt
With CreateObject("vbscript.regexp")
..Global = True
..ignorecase = True
..Pattern = vbCrLf & ".*?title.*?(?=" & vbCrLf & ")"
If .test(strTxt) Then
Set oMatches = .Execute(strTxt)
For i = 0 To oMatches.Count - 1
Set reg = CreateObject("vbscript.regexp")
With reg
..Global = True
..ignorecase = True
..Pattern = "<title>\""(.*?)\"""
k = Cells(Rows.Count, 1).End(xlUp).Offset(1).Row
Cells(k, 1).Value = strFile
If .test(oMatches(i)) Then
Set oMatches2 = .Execute(oMatches(i))
For j = 0 To oMatches2.Count - 1
Cells(k, j + 3) = .Replace(oMatches2(j), "$1")
Next j
End If
End With
Next i
End If
End With
End Sub
I was hoping to get some tips on how to get what is in between
<Title>....</Title> tags in a text file containing HTML source code
into column 3. The VBA code searches through many folders and
subfolders finding any file that is .html and reading it. I tried
parsing but can't seem to get it to work. Any ideas? Here is the code
that I have (that doesn't work)
Sub CheckTextFilesForHREFs()
MsgBox "Press OK to begin report"
Dim WholeLine As String
Dim myPath As String
Dim workfile As String
Dim myR As Long
myPath = "C:\Exelon\"
workfile = Dir(myPath & "*.html")
'sLine = WholeLine
Set fs = Application.FileSearch
With fs
..LookIn = "C:\Exelon"
..Filename = ".html"
..SearchSubFolders = True
'.FileType = mosFileTypeAllFiles
If .Execute(SortBy:=msoSortByFileName, _
SortOrder:=msoSortOrderAscending) > 0 Then
MsgBox "There were " & .FoundFiles.Count & _
" file(s) found."
For i = 1 To .FoundFiles.Count
ParseURL .FoundFiles(i)
ParseTitle .FoundFiles(i)
ParseLink .FoundFiles(i)
Next i
Else
MsgBox "There were no files found."
End If
End With
Sub ParseTitle(strFile As String)
Dim strTxt As String, lngTxt As Long, i As Long, oMatches
Dim ws As Worksheet, j As Long, k As Long, m As Long, oMatches2
Dim reg, oMatches3, reg2
i = FreeFile
'strFile = "c:\Users\Richard\Documents\Htmltest.html"
lngTxt = FileLen(strFile)
strTxt = Space(lngTxt)
Open strFile For Binary Access Read As #i
Get #i, , strTxt
Close #i
Debug.Print strTxt
With CreateObject("vbscript.regexp")
..Global = True
..ignorecase = True
..Pattern = vbCrLf & ".*?title.*?(?=" & vbCrLf & ")"
If .test(strTxt) Then
Set oMatches = .Execute(strTxt)
For i = 0 To oMatches.Count - 1
Set reg = CreateObject("vbscript.regexp")
With reg
..Global = True
..ignorecase = True
..Pattern = "<title>\""(.*?)\"""
k = Cells(Rows.Count, 1).End(xlUp).Offset(1).Row
Cells(k, 1).Value = strFile
If .test(oMatches(i)) Then
Set oMatches2 = .Execute(oMatches(i))
For j = 0 To oMatches2.Count - 1
Cells(k, j + 3) = .Replace(oMatches2(j), "$1")
Next j
End If
End With
Next i
End If
End With
End Sub