Option Explicit
'
' Unwraps and formats raw scans of books.
' Removes excessive blank lines and spaces,
' removes page numbers and headers. Ignores
' any file that appears to already have
' unwrapped text.
'
' Written by Eric Phelps
' http://www.ericphelps.com
'
' Public Domain. Distribute freely. NO WARRANTY
' Just because it worked a few times for me on my computer,
' with my version of scripting, and on the few test files
' I checked does NOT mean it will work for you! Consider
' this code a STARTING POINT, not a finished product!
'
' This program may be automated since it was DESIGNED
' not to pop up any message boxes or error dialogs
' requiring user intervention. In theory.
'
' Typical Windows 95 DOS batch command line:
' for %%x in (*.txt) do start /w cscript.exe unwrap.vbs %%x
' You can also drop etext text files on this script.
'
Main
Wscript.Quit 0

Sub Main()
Dim filEtextFile 'As Scripting.File
Dim strEtextFileName 'As String
Dim tsEtext 'As Scripting.TextStream
Dim strEtextContent 'As String
Dim strEtextLine 'As String
Dim strOldEtextLine 'As String
Dim lngPosition, lngStart, lngEnd 'As Long
Dim intCounter 'As Integer
Dim intUnwrapped 'As Integer
Dim blnPunctuation 'As Boolean
Dim fs 'As Scripting.FileSystemObject
Const ForReading = 1 'Scripting.IOMode
Const ForWriting = 2 'Scripting.IOMode

    On Error Resume Next

    '''''''''' Get the file name passed as an argument
    Set fs = CreateObject("Scripting.FileSystemObject")
    If Wscript.Arguments.Count <> 1 Then
        CreateObject("WScript.Shell").Popup "You must pass a text file name on the command line!", 5, "Error"
        Wscript.Quit 1
    End If
    strEtextFileName = Wscript.Arguments(0)
    If Not fs.FileExists(strEtextFileName) Then
        CreateObject("WScript.Shell").Popup Wscript.Arguments(0) & " is not a legitimate file name.", 5, "Error"
        Wscript.Quit 1
    End If
    If Lcase(Right(strEtextFileName, 4)) <> ".txt" Then
        CreateObject("WScript.Shell").Popup Wscript.Arguments(0) & " is not a text (.TXT) file name.", 5, "Error"
        Wscript.Quit 1
    End If
    Set filEtextFile = fs.GetFile(strEtextFileName)
    Status "Processing file: " & filEtextFile.Name
    
    
    'Read the first few lines to see if it is unwrapped
    Set tsEtext = filEtextFile.OpenAsTextStream(ForReading)
    'Get past the titles and introductions into the body
    For intCounter = 1 to 100
        strEtextLine = tsEtext.ReadLine
    Next
    'Check the body
    intUnwrapped = 0
    For intCounter = 1 to 100
        strEtextLine = Trim(tsEtext.ReadLine)
        If Len(strEtextLine) > 128 Then
            If Instr(1, "ABCDEFGHIJKLMNOPQRSTUVWXYZ""", Left(strEtextLine, 1), vbBinaryCompare) <> 0 Then
                intUnWrapped = intUnWrapped + 1
            End If
        End If
    Next
    If intUnWrapped > 20 Then 
        Status "This text is already unwrapped."
        Wscript.Quit 0
    Else
        Status "This text is not unwrapped."
    End If
    
    'Read the entire file for unwrapping
    Set tsEtext = filEtextFile.OpenAsTextStream(ForReading)
    Status "Loading etext file into memory for processing..."
    strEtextContent = tsEtext.ReadAll
    tsEtext.Close
    
    ''''''''''Unwrap the text
    'Recover from stupid FTP text download
    strEtextContent = Replace(strEtextContent, vbCr & vbCrLf, vbCrLf)
    'Remove excessive linefeeds
    Do While Instr(strEtextContent, vbCrLf & vbCrLf & vbCrLf) <> 0
        strEtextContent = Replace(strEtextContent, vbCrLf & vbCrLf & vbCrLf, vbCrLf & vbCrLf)
    Loop
    Do While Instr(strEtextContent, vbLf & vbLf & vbLf) <> 0
        strEtextContent = Replace(strEtextContent, vbLf & vbLf & vbLf, vbLf & vbLf)
    Loop
    Do While Instr(strEtextContent, vbCr & vbCr & vbCr) <> 0
        strEtextContent = Replace(strEtextContent, vbCr & vbCr & vbCr, vbCr & vbCr)
    Loop
    'Remove hyphenation
    Status "Unwrapping hyphenated words"
    strEtextContent = Replace(strEtextContent, "-" & vbCrLf, "")
    strEtextContent = Replace(strEtextContent, "-" & vbCr, "")
    strEtextContent = Replace(strEtextContent, "-" & vbLf, "")
    'Save double linefeeds temporarily
    Status "Saving double carriage returns"
    strEtextContent = Replace(strEtextContent, vbCrLf & vbCrLf, "^P")
    strEtextContent = Replace(strEtextContent, vbCr & vbCr, "^P")
    strEtextContent = Replace(strEtextContent, vbLf & vbLf, "^P")
    'Unwrap all text
    Status "Unwrapping text"
    strEtextContent = Replace(strEtextContent, vbCrLf, " ")
    strEtextContent = Replace(strEtextContent, vbCr, " ")
    strEtextContent = Replace(strEtextContent, vbLf, " ")
    'Replace the saved double linefeeds with single linefeeds
    Status "Restoring double carriage returns"
    strEtextContent = Replace(strEtextContent, "^P", vbCrLf)
    'Replace double spaces with single spaces.
    Status "Replacing double spaces with single spaces"
    Do While Instr(strEtextContent, "  ")
        strEtextContent = Replace(strEtextContent, "  ", " ")
    Loop
    Status "Replacing single quotes with double quotes"
    Do While Instr(strEtextContent, "''")
        strEtextContent = Replace(strEtextContent, "''", """")
    Loop
    Do While Instr(strEtextContent, " '")
        strEtextContent = Replace(strEtextContent, " '", " """)
    Loop
    Do While Instr(strEtextContent, "' ")
        strEtextContent = Replace(strEtextContent, "' ", """ ")
    Loop
    Do While Instr(strEtextContent, "'.")
        strEtextContent = Replace(strEtextContent, "'.", """.")
    Loop
    Do While Instr(strEtextContent, "'" & vbCrLf)
        strEtextContent = Replace(strEtextContent, "'" & vbCrLf, """" & vbCrLf)
    Loop
    Do While Instr(strEtextContent, vbCrLf & "'")
        strEtextContent = Replace(strEtextContent, vbCrLf & "'", vbCrLf & """")
    Loop
    Status "Replacing common mis-scans"
    Do While Instr(strEtextContent, "Tm")
        strEtextContent = Replace(strEtextContent, "Tm", "I'm")
    Loop
    Do While Instr(strEtextContent, " 1 ")
        strEtextContent = Replace(strEtextContent, " 1 ", " I ")
    Loop
    
    ''''''''''Write the changed text back to the file
    Status "Saving unwrapped text to file"
    Set tsEtext = filEtextFile.OpenAsTextStream(ForWriting)
    tsEtext.Write strEtextContent
    strEtextContent = ""
    tsEtext.Close
    
    Status "Removing page numbers and embedded title lines"
    Set tsEtext = filEtextFile.OpenAsTextStream(ForReading)
    'Read the first four lines and allow anything.
    strEtextContent = tsEtext.ReadLine & vbCrLf
    strEtextContent = strEtextContent & tsEtext.ReadLine & vbCrLf
    strEtextContent = strEtextContent & tsEtext.ReadLine & vbCrLf
    strOldEtextLine = Trim(tsEtext.ReadLine)
    'Read the rest of the file, removing page numbers and titles, etc..
    Do Until tsEtext.AtEndOfStream
        strEtextLine = tsEtext.ReadLine
        strEtextLine = Trim(strEtextLine)
        If strEtextLine <> "" Then
            'Remove lines that contain tabs
            If Instr(strEtextLine, vbTab) <> 0 Then
                strEtextLine = " "
            End If
            'Remove short lines that start with numbers
            If Instr("123456789", Left(strEtextLine, 1)) Then
                If Len(strEtextLine) < 60 Then
                    If Right(strEtextLine, 1) <> "." Then
                        strEtextLine = " "
                    End If
                End If
            End If
            'Remove short lines that end with numbers
            If Instr("0123456789", Right(strEtextLine, 1)) Then
                If Len(strEtextLine) < 60 Then
                    strEtextLine = " "
                End If
            End If
            'Remove short lines that have no ending punctuation
            If Len(strEtextLine) < 40 Then
                blnPunctuation = False
                If Right(strEtextLine, 1) = "." Then blnPunctuation = True
                If Right(strEtextLine, 1) = "!" Then blnPunctuation = True
                If Right(strEtextLine, 1) = "?" Then blnPunctuation = True
                If Right(strEtextLine, 1) = """" Then blnPunctuation = True
                If Not blnPunctuation Then strEtextLine = " "
            End If
        End If
        'Four possibilities: good-good, good-bad, bad-good, bad-bad
        'Good lines are real lines. Bad lines are header lines 
        'that have been turned into a single " " by the above code.
        'Good-good
        If ((strOldEtextLine <> " ") And (strEtextLine <> " ")) Then
            strEtextContent = strEtextContent & strOldEtextLine & vbCrLf
        End If
        'Good-bad
        If ((strOldEtextLine <> " ") And (strEtextLine = " ")) Then
            strEtextContent = strEtextContent & strOldEtextLine & " "
        End If
        'Bad-good
        If ((strOldEtextLine = " ") And (strEtextLine <> " ")) Then
            'Do nothing
        End If
        'Bad-bad
        If ((strOldEtextLine = " ") And (strEtextLine = " ")) Then
            'Do nothing
        End If
        strOldEtextLine = strEtextLine
    Loop
    strEtextContent = strEtextContent & strOldEtextLine
    
    ''''''''''Write the changed text back to the file
    Status "Saving changes for the last time"
    Set tsEtext = filEtextFile.OpenAsTextStream(ForWriting)
    tsEtext.Write strEtextContent
    strEtextContent = ""
    tsEtext.Close
    
    'Finished! Tell the world.
    CreateObject("WScript.Shell").Popup "Finished " & filEtextFile.Name, 2, "Success"
End Sub


Sub Status (strMessage)
'If the program was run with CSCRIPT, this writes a
'line into the DOS box. If run with WSCRIPT, it does nothing.
    If Lcase(Right(Wscript.FullName, 12)) = "\cscript.exe" Then 
        Wscript.Echo strMessage
    End If
End Sub

