Home | Contact Us | FAQ | Search & Site Map | Link to Us
Sign In | Join | Other 45 Sites in Network
HomeAnnouncementsWhite Papers
Discussion GroupsFirst AidDatabasesJavaBeansGUIJava 3DVirtual MachineCORBASecurityToolsGeneral
Java DirectoryOpen Source ProjectsSample Book ChaptersUser GroupsWeb Resources
Related Topics
Databases.NETMore Topics ...

Java Forum / General / June 2007

Tip: Looking for answers? Try searching our database.

code to clean up texts

Thread view: 
lbrtchx@hotmail.com - 04 Jun 2007 20:43 GMT
Hi,
~
Does anyone around here know of data analysis/text cleansing
libraries/code to programmatically consolidate the lines of a text into
whole paragraphs?
~
does any one around here know of data analysis/text cleansing
libraries/code to programmatically consolidate lines in a text to
whole paragraphs?
~
Say you have this:
~
Four score and seven years ago
our fathers brought forth on this continent,
a new nation, conceived in Liberty,
and dedicated to the proposition
that all men are created equal.
~
to a whole paragraph
~
Four score and seven years ago our fathers brought forth on this
continent, a new nation, conceived in Liberty, and dedicated to the
proposition that all men are created equal.
~
Where can I find them?
~
Thanks
lbrtchx
Hal Rosser - 05 Jun 2007 00:22 GMT
> Hi,
> ~
[quoted text clipped - 24 lines]
> Thanks
> lbrtchx

All you need to do is remove the \n's and \r's and that other character I
don't recall.
Roedy Green - 05 Jun 2007 23:26 GMT
> does any one around here know of data analysis/text cleansing
>libraries/code to programmatically consolidate lines in a text to
>whole paragraphs?

Here is a little utility I use called REFLOW.  I have never published
it, so it may be a little crude.

// com.mindprod.reflow.Reflow.java
package com.mindprod.reflow;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.EOFException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Iterator;

/**
 * Reflows the lines of a text file into paragraphs whose lines are
 * about the same length, with paragraphs separated by a single blank line.
 * The input file is replaced by the reflowed text via a temporary work file.
 *
 * usage: java com.mindprod.reflow.Reflow file.txt
 * copyright (c) 2003-2007 Roedy Green, Canadian Mind Products
 * #101 - 2536 Wark Street
 * Victoria, BC Canada V8T 4G8
 * tel: (250) 361-9093
 * http://mindprod.com
 *
 * Source and executables may be freely used for any purpose except military.
 */
public class Reflow
{

    /**
     * Max line length of output. Ideally would be configurable.
     */
    public static int LINELENGTH = 60;

    /** Not displayed; kept so the notice is embedded in the class file. */
    private static final String EmbeddedCopyright =
        "copyright (c) 2003-2007 Roedy Green, Canadian Mind Products, http://mindprod.com";

    /** Character code 160 (non-breaking space); treated like an ordinary blank. */
    private static final int NBSP = 160;

    /** Buffer size, in chars, used for both the reader and the writer. */
    private static final int BUFFSIZE = 4096;

    // input "before" file
    static String inFilename;
    static File inFile;
    static BufferedReader inReader;

    // output "after" file, the temporary, later renamed to match the input
    static String outFilename;
    static File outFile;
    static PrintWriter outWriter;

    /**
     * Command line utility to reflow the text.
     *
     * @param args exactly one element: the name of the file to reflow in place.
     */
    public static void main( String[] args )
    {
        try
        {
            analyseCommandLine( args );

            /* Open input first so a missing file is reported before any work file is created. */
            openInReader();
            openOutWriter();

            System.out.println( "Reflowing " + inFilename );

            /* copy inReader to outWriter, reflowing the text */
            processFiles();

            inReader.close();
            outWriter.close();
            // PrintWriter never throws IOException; it only records the failure.
            // Without this check a short write (e.g. disk full) would go
            // unnoticed and the truncated work file would replace the input.
            if ( outWriter.checkError() )
            {
                throw new IOException( "failure writing " + outFile );
            }

            replaceInputWithOutput();
        }
        catch ( IOException e )
        {
            System.out.print( "Oops!  IO failure. e.g. out of disk space. \n" );
            discardOutFile();
            die();
        }
    }

    /**
     * Analyse the command line. It must hold exactly one argument, the filename.
     * On any other argument count, prints the banner and usage and terminates via {@link #die()}.
     *
     * @param args the command line arguments as passed to main.
     */
    static void analyseCommandLine( String[] args )
    {
        if ( args.length != 1 )
        {
            banner();
            System.out.println( "Oops!  usage:  com.mindprod.reflow.Reflow Myfile.txt \n" );
            die();
        }

        inFilename = args[0]; /* file to convert */
    }

    /**
     * Display a banner about the author.
     */
    static void banner()
    {
        /* Usually not displayed, just embedded. */
        System.out.println( "\n°±²Û Reflow 1.0 Û²±°"
                            + "\nFreeware to reflow text."
                            + "\ncopyright (c) 2003-2007 Roedy Green, Canadian Mind Products"
                            + "\n#101 - 2536 Wark Street, Victoria, BC Canada V8T 4G8"
                            + "\nTelephone: (250) 361-9093 Internet:roedyg@mindprod.com"
                            + "\nMay be used freely for non-military use only\n\n" );
    }

    /**
     * Open the input "before" file named by inFilename, setting inFile and inReader.
     * If the file is missing, unreadable or unwritable (it will be replaced later),
     * prints the banner and a message and terminates via {@link #die()}.
     */
    static void openInReader()
    {
        try
        {
            inFile = new File( inFilename );
            if ( !inFile.exists() )
            {
                rejectInput( "Oops!  Cannot find file " );
            }
            if ( !inFile.canRead() )
            {
                rejectInput( "Oops!  no permission to read (i.e. examine) the file " );
            }
            if ( !inFile.canWrite() )
            {
                rejectInput( "Oops!  no permission to write (i.e. change) the file " );
            }

            inReader = new BufferedReader( new FileReader( inFile ), BUFFSIZE );
        }
        catch ( FileNotFoundException e )
        {
            rejectInput( "Oops!  Cannot open file " );
        }
    }

    /**
     * Print the banner, then the problem text followed by the input filename, and abort.
     *
     * @param problem message prefix; the filename is appended directly after it.
     */
    private static void rejectInput( String problem )
    {
        banner();
        System.out.print( problem );
        System.out.println( inFilename );
        die();
    }

    /**
     * Open the output "after" file: a temporary work file created next to inFile,
     * wrapped in a buffered PrintWriter without auto flush.
     * On failure prints a message and terminates via {@link #die()}.
     */
    static void openOutWriter()
    {
        try
        {
            // get a temporary file in the same directory as inFile,
            // so that the final rename stays within one directory.
            outFile = createTempFile( "Reflow", ".tmp", inFile );
            outWriter = new PrintWriter(
                new BufferedWriter( new FileWriter( outFile ), BUFFSIZE ),
                false /* auto flush */ );
        }
        catch ( IOException e )
        {
            System.out.println( "Oops!  Cannot create the temporary work file\n" );
            // the work file may exist even though opening the writer failed
            if ( outFile != null )
            {
                outFile.delete();
            }
            die();
        }
    }

    /**
     * Create a temporary file,
     * Slightly smarter version of File.createTempFile
     *
     * @param prefix beginning letters of filename; File.createTempFile
     *               requires at least three characters.
     * @param suffix ending letters of filename, including the dot if one is wanted.
     * @param near directory where to put file, or an existing file to
     *             place this temp file near in the same directory.
     *             null, or a path that is neither an existing directory nor
     *             an existing file, puts the temp file in the default
     *             temporary-file directory.
     * @return A temporary file. It will not automatically
     *         delete on program completion, however.
     * @exception IOException if the file could not be created.
     */
    public static File createTempFile( String prefix, String suffix, File near ) throws IOException
    {
        if ( near != null )
        {
            if ( near.isDirectory() )
            {
                return File.createTempFile( prefix, suffix, near );
            }
            if ( near.isFile() )
            {
                // Resolve against the current directory first: a bare relative
                // name such as "file.txt" has no parent component of its own.
                File dir = near.getAbsoluteFile().getParentFile();
                if ( dir != null && dir.isDirectory() )
                {
                    return File.createTempFile( prefix, suffix, dir );
                }
            }
        }
        // anything else, let File pick the default temporary-file directory.
        return File.createTempFile( prefix, suffix );
    }

    /**
     * Copy inReader to outWriter, reflowing.
     * Presumes both are already open. Does not close them.
     * Blanks, tabs and non-breaking spaces separate words; a newline directly
     * following another newline (ignoring blanks and carriage returns between
     * them) ends the paragraph.
     *
     * @exception IOException if reading fails.
     */
    static void processFiles() throws IOException
    {
        // list of words in the current paragraph
        ArrayList words = new ArrayList( 149 );

        // have we just seen a new line? blank lines separate paragraphs
        boolean recentNL = false;

        // the current word we are building up.
        StringBuffer word = new StringBuffer( 50 );

        int c;
        while ( ( c = inReader.read() ) >= 0 )
        {
            switch ( c )
            {
                case NBSP:
                case ' ':
                case '\t':
                    flushWord( word, words );
                    break;

                case '\n':
                    flushWord( word, words );
                    if ( recentNL )
                    {
                        emitParagraph( words, LINELENGTH );
                        words.clear();
                        recentNL = false;
                    }
                    else
                    {
                        recentNL = true;
                    }
                    break;

                case '\r':
                    /* dos has \r\n, unix just \n */
                    /* we just ignore them here and generate them as needed on \n. */
                    break;

                default:
                    /* ordinary non-blank char */
                    recentNL = false;
                    word.append( (char) c );
                    break;
            }
        }

        // The input may end in the middle of a word (no trailing newline);
        // keep that word, then dump the possible last paragraph.
        flushWord( word, words );
        emitParagraph( words, LINELENGTH );
    }

    /**
     * Move the word being built, if any, onto the paragraph's word list and reset the buffer.
     *
     * @param word  buffer holding the characters of the current word.
     * @param words list the completed word is appended to.
     */
    private static void flushWord( StringBuffer word, ArrayList words )
    {
        if ( word.length() != 0 )
        {
            words.add( word.toString() );
            word.setLength( 0 );
        }
    }

    /**
     * Emits paragraph followed by blank line to outWriter.
     * Does nothing for an empty paragraph.
     *
     * @param words  Array list of words (Strings) to output
     * @param maxLineLength
     *               maximum line length. If a word is longer
     *               it will not be split.
     */
    static void emitParagraph( ArrayList words, int maxLineLength )
    {
        /* if paragraph empty, nothing to do */
        if ( words.size() == 0 )
        {
            return;
        }
        int lineLength = 0;
        for ( Iterator iter = words.iterator(); iter.hasNext(); )
        {
            String word = (String) iter.next();
            if ( lineLength != 0 )
            {
                // +1 accounts for the space that would separate it from the previous word
                if ( lineLength + word.length() + 1 > maxLineLength )
                {
                    // won't fit. Start a new line, no lead space.
                    outWriter.println();
                    lineLength = 0;
                }
                else
                {
                    // will fit, add lead space
                    outWriter.print( ' ' );
                    lineLength++;
                }
            }
            outWriter.print( word );
            lineLength += word.length();
        }

        // end the last line, then the blank separator line
        outWriter.println();
        outWriter.println();
    }

    /**
     * Replace the input file with the finished work file.
     * Both streams must already be closed. Terminates via {@link #die()} if
     * the input cannot be deleted or the work file cannot be renamed.
     */
    private static void replaceInputWithOutput()
    {
        if ( !inFile.delete() )
        {
            // the original is untouched, so the work file is just clutter
            outFile.delete();
            System.out.println( "Oops!  Cannot replace the file " + inFilename );
            die();
        }
        if ( !outFile.renameTo( inFile ) )
        {
            // the original is already gone: keep the work file and say where it is
            System.out.println( "Oops!  Cannot rename the work file to " + inFilename
                                + "; the reflowed text is in " + outFile );
            die();
        }
        // don't delete outFile, it has been renamed to a real file
    }

    /**
     * Close the writer and remove the work file after a failed run.
     */
    private static void discardOutFile()
    {
        if ( outWriter != null )
        {
            outWriter.close();
        }
        if ( outFile != null )
        {
            outFile.delete();
        }
    }

    /**
     * Make a noise. Best effort: never fails.
     */
    static void honk()
    {
        try
        {
            java.awt.Toolkit.getDefaultToolkit().beep();
        }
        catch ( Throwable ignored )
        {
            // No usable AWT toolkit; the beep is a nicety and must not
            // stop die() from cleaning up.
        }
    }

    /**
     * Abort the run, clean up as best as possible.
     * Closes both streams if open and exits with status 1.
     */
    static void die()
    {
        honk();
        try
        {
            if ( inReader != null )
            {
                inReader.close();
            }
        }
        catch ( IOException ignored )
        {
            // already aborting; nothing useful to do about a failed close
        }
        if ( outWriter != null )
        {
            outWriter.close();
        }
        System.exit( 1 );   /* exit with errorlevel = 1 */
    }

}

--
Roedy Green Canadian Mind Products
The Java Glossary
http://mindprod.com


Free Magazines

Get these publications absolutely FREE for up to 12 months. There are no hidden fees and no obligation. Simply choose a title, complete the application form and submit it. Read more ...

Oracle MagazineNetwork ComputingComputer WorldBio-IT WorldeWeekInformation WeekInfosecurity
 
Sign In
Join
My Latest Posts
My Monitored Threads
My Blog
My Photo Gallery
My Profile
My Homepage

Start New Thread
Enable EMail Alerts
Rate this Thread



©2008 Advenet LLC   Privacy Policy - Terms of Use
This website includes both content owned or controlled by Advenet as well as content owned or controlled by third parties.