G
Garbunkel
Hello,
I am attempting to extract text data from a Microsoft Word document into
Excel. Each separate linebreak and sentence is parsed into a separate cell.
The problem is that there are several portion of the document that have
tables, which have text that I don't want to include.
Is there any way to detect text that is embedded within a table?
The code I'm using is as follows:
Sub Get_Text()
Dim rbreakpt As Long
Dim lbreakpt As Long
Dim roffset As Long
Dim curr_Row As Long
Dim SRS_Sent As String
Dim SRS_Temp As String
Dim SRS_Print As String
Set ActiveWB = ActiveWorkbook
Set objWord = CreateObject("Word.Application")
objWord.Documents.Open "C:\Work\Stuff.doc", False, True
Rem Access the document
Set currentDocument = objWord.Documents(1)
curr_Row = 1
lbreakpt = 1
SRS_Sent =
currentDocument.TablesOfContents.Application.ActiveDocument.Content.Text
SRS_Temp = SRS_Sent
While ((InStr(lbreakpt, SRS_Sent, vbLf, vbTextCompare)) Or _
(InStr(lbreakpt, SRS_Sent, vbCr, vbTextCompare)))
rbreakpt = InStr(lbreakpt, SRS_Sent, Chr(13), vbTextCompare)
roffset = rbreakpt - lbreakpt
SRS_Temp = Mid(SRS_Sent, lbreakpt, roffset)
lbreakpt = rbreakpt + 1
While (InStr(1, SRS_Temp, ".", vbTextCompare) And Len(SRS_Temp) > 2)
SRS_Print = Trim(Left(SRS_Temp, InStr(1, SRS_Temp, ".", _
vbTextCompare)))
ActiveWB.Worksheets(1).Cells(curr_Row, 1).Value = Trim(SRS_Print)
SRS_Temp = Right(SRS_Temp, Len(SRS_Temp) - _
InStr(1, SRS_Temp, ".", vbTextCompare))
'DOC filename
ActiveWB.Worksheets(1).Cells(curr_Row, 2).Value = currentDocument
curr_Row = curr_Row + 1
Wend
'Output rows of any data greater than 1 char
If (Len(SRS_Temp) > 1) Then
curr_Row = curr_Row + 1
'DOC req text
ActiveWB.Worksheets(1).Cells(curr_Row, 1).Value = SRS_Temp
'DOC filename
ActiveWB.Worksheets(1).Cells(curr_Row, 2).Value = currentDocument
End If
Wend
'Close the current document
currentDocument.Close
Set currentDocument = Nothing
End Sub
Thanks in advance.
I am attempting to extract text data from a Microsoft Word document into
Excel. Each separate linebreak and sentence is parsed into a separate cell.
The problem is that there are several portion of the document that have
tables, which have text that I don't want to include.
Is there any way to detect text that is embedded within a table?
The code I'm using is as follows:
Sub Get_Text()
Dim rbreakpt As Long
Dim lbreakpt As Long
Dim roffset As Long
Dim curr_Row As Long
Dim SRS_Sent As String
Dim SRS_Temp As String
Dim SRS_Print As String
Set ActiveWB = ActiveWorkbook
Set objWord = CreateObject("Word.Application")
objWord.Documents.Open "C:\Work\Stuff.doc", False, True
Rem Access the document
Set currentDocument = objWord.Documents(1)
curr_Row = 1
lbreakpt = 1
SRS_Sent =
currentDocument.TablesOfContents.Application.ActiveDocument.Content.Text
SRS_Temp = SRS_Sent
While ((InStr(lbreakpt, SRS_Sent, vbLf, vbTextCompare)) Or _
(InStr(lbreakpt, SRS_Sent, vbCr, vbTextCompare)))
rbreakpt = InStr(lbreakpt, SRS_Sent, Chr(13), vbTextCompare)
roffset = rbreakpt - lbreakpt
SRS_Temp = Mid(SRS_Sent, lbreakpt, roffset)
lbreakpt = rbreakpt + 1
While (InStr(1, SRS_Temp, ".", vbTextCompare) And Len(SRS_Temp) > 2)
SRS_Print = Trim(Left(SRS_Temp, InStr(1, SRS_Temp, ".", _
vbTextCompare)))
ActiveWB.Worksheets(1).Cells(curr_Row, 1).Value = Trim(SRS_Print)
SRS_Temp = Right(SRS_Temp, Len(SRS_Temp) - _
InStr(1, SRS_Temp, ".", vbTextCompare))
'DOC filename
ActiveWB.Worksheets(1).Cells(curr_Row, 2).Value = currentDocument
curr_Row = curr_Row + 1
Wend
'Output rows of any data greater than 1 char
If (Len(SRS_Temp) > 1) Then
curr_Row = curr_Row + 1
'DOC req text
ActiveWB.Worksheets(1).Cells(curr_Row, 1).Value = SRS_Temp
'DOC filename
ActiveWB.Worksheets(1).Cells(curr_Row, 2).Value = currentDocument
End If
Wend
'Close the current document
currentDocument.Close
Set currentDocument = Nothing
End Sub
Thanks in advance.