[ Team LiB ] Previous Section Next Section

Recipe 26.2 Using a Servlet to Harvest Web Data

Problem

You want to use a servlet to harvest web information.

Solution

Use the HTML parsing API classes of the Java 2 Software Development Kit (SDK).

Discussion

The last recipe introduced the relevant subpackages of the javax.swing.text package; this recipe shows how to use them in a servlet. Example 26-3 imports the classes needed to parse an HTML page. The servlet's doGet( ) method displays a form in which the user enters a stock symbol (such as "INTC"; the symbol is case-insensitive).

Then the doPost( ) method attempts to get a live stock quote for that symbol by parsing a web page from finance.yahoo.com.

Example 26-3. Harvesting web data from a servlet
package com.jspservletcookbook;    

import java.io.IOException;  
import java.io.PrintWriter;     
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.MalformedURLException;
import javax.servlet.*;
import javax.servlet.http.*;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Servlet that harvests a stock quote from a web page.
 *
 * <p>doGet( ) renders a form asking for a stock symbol; doPost( ) fetches
 * the quote page for that symbol, parses it with the Swing HTML parser, and
 * prints the extracted price.
 *
 * <p>Thread-safety: a servlet container normally routes all requests
 * through a single servlet instance, so this class keeps no mutable
 * instance state. Everything a request needs while parsing lives in a
 * MyParserCallback object created for that request alone.
 */
public class HtmlParseServlet extends HttpServlet {

    /** Prefix of the quote page URL; the encoded symbol is appended. */
    private static final String BASE_URL = "http://finance.yahoo.com"+
        "/q?d=t&s=";

  /**
   * Displays the HTML form in which the user enters a stock symbol.
   * The form posts back to this servlet so that doPost( ) handles it.
   *
   * @param request  the HTTP request; only its context path is used
   * @param response the HTTP response the form is written to
   * @throws ServletException declared for the servlet contract
   * @throws java.io.IOException if the response cannot be written
   */
  public void doGet(HttpServletRequest request, 
    HttpServletResponse response) throws ServletException, 
      java.io.IOException {
    
      //set the MIME type of the response, "text/html"
      response.setContentType("text/html");
        
      //use a PrintWriter to send text 
      java.io.PrintWriter out = response.getWriter( );
        
      //Begin assembling the HTML content
      out.println("<html><head>");
    
      out.println("<title>Stock Price Fetcher</title></head><body>");
      out.println("<h2>Please submit a valid stock symbol</h2>");
   
      //make sure method="post" so that the servlet service method
      //calls doPost in the response to this form submit
      //NOTE(review): presumably this servlet is mapped to /stockservlet
      //in the deployment descriptor -- confirm against web.xml
      out.println(
        "<form method=\"post\" action =\"" + request.getContextPath( ) +
            "/stockservlet\" >");

      out.println("<table border=\"0\"><tr><td valign=\"top\">");
      out.println("Stock symbol: </td>  <td valign=\"top\">");
      out.println("<input type=\"text\" name=\"symbol\" size=\"10\">");
      out.println("</td></tr><tr><td valign=\"top\">");

      out.println(
      "<input type=\"submit\" value=\"Submit Info\"></td></tr>");

      out.println("</table></form>");
      out.println("</body></html>");

  } //doGet
         
  /**
   * Looks up the latest price for the submitted "symbol" parameter and
   * writes it to the response as HTML.
   *
   * <p>A missing or blank symbol produces an apology message. If no price
   * can be extracted from the quote page (unknown ticker, or price text
   * that is not a number) the page reports that the symbol is probably
   * invalid instead of letting the exception reach the container.
   *
   * @param request  the HTTP request carrying the "symbol" parameter
   * @param response the HTTP response the result is written to
   * @throws java.io.IOException if the response cannot be written or the
   *         quote page cannot be read
   */
  public void doPost(HttpServletRequest request, 
    HttpServletResponse response)
      throws java.io.IOException{

      //this will hold the stock symbol
      String symbol = request.getParameter("symbol");
      boolean isValid = (symbol != null && symbol.trim( ).length( ) > 0);

      //set the MIME type of the response, "text/html"
      response.setContentType("text/html");
      java.io.PrintWriter out = response.getWriter( );
        
      //Begin assembling the HTML content
      out.println("<html><head>");
      out.println("<title>Latest stock value</title></head><body>");
      if (! isValid){
          out.println(
            "<h2>Sorry, the stock symbol parameter was either empty "+
            "or null</h2>");
      } else {
          symbol = symbol.trim( );
          //the symbol is user input: escape it before echoing it as HTML
          out.println("<h2>Here is the latest value of "+
            escapeHtml(symbol) +"</h2>");

          float price;//The stock's latest price
          try {
              price = getLatestPrice(symbol);
          } catch (IllegalStateException noQuote) {
              //The callback aborts parsing with this exception when the
              //page reports an unknown ticker or the price text is not
              //numeric; both cases mean "no quote", reported below.
              price = 0f;
          }
          out.println( (price==0? "The symbol is probably invalid." :
            ""+price) );
      }
      out.println("</body></html>");

  }// doPost
        
  /**
   * Fetches the quote page for a symbol and extracts the last trade price.
   *
   * @param symbol the stock symbol; it is URL-encoded before being
   *        appended to the query string
   * @return the price found by the parser callback, or 0 if none was found
   * @throws IOException if the page cannot be opened or read
   * @throws MalformedURLException if the resulting URL is not valid
   * @throws IllegalStateException if the callback rejects the page content
   */
  private float getLatestPrice(String symbol) throws IOException,
    MalformedURLException {

       //encode the symbol so that characters such as '&' or '#' cannot
       //alter the structure of the request URL
       URL stockSite = new URL(BASE_URL +
         java.net.URLEncoder.encode(symbol, "UTF-8"));

       //parser and callback are local, so each request thread works on
       //its own objects and no locking is required
       ParserDelegator htmlParser = new ParserDelegator( );
       MyParserCallback callback = new MyParserCallback( );

       BufferedReader webPageStream = new BufferedReader(
         new InputStreamReader(stockSite.openStream( )));
       try {
           htmlParser.parse(webPageStream,callback,true);
       } finally {
           //always release the network connection, even if parsing fails
           webPageStream.close( );
       }
       return callback.getStockVal( );
  }//getLatestPrice

  /**
   * Escapes the characters that are significant in HTML text and
   * attribute values.
   *
   * @param text the raw text; must not be null
   * @return the text with &amp;, &lt;, &gt;, double and single quotes
   *         replaced by character references
   */
  private static String escapeHtml(String text) {
      StringBuffer escaped = new StringBuffer(text.length( ) + 16);
      for (int i = 0; i < text.length( ); i++) {
          char c = text.charAt(i);
          switch (c) {
              case '&':  escaped.append("&amp;");  break;
              case '<':  escaped.append("&lt;");   break;
              case '>':  escaped.append("&gt;");   break;
              case '"':  escaped.append("&quot;"); break;
              case '\'': escaped.append("&#39;");  break;
              default:   escaped.append(c);
          }
      }
      return escaped.toString( );
  }//escapeHtml
        
  /**
   * Parser callback that looks for the text "Last Trade:" followed by a
   * bold tag, and records the text inside that tag as the stock price.
   * One instance holds the state of exactly one parse; it is static
   * because it does not touch any state of the enclosing servlet.
   */
  static class MyParserCallback extends ParserCallback {

      //bread crumbs that lead us to the stock price
      private boolean lastTradeFlag = false; 
      private boolean boldFlag = false;

      //the extracted price; stays 0 until a price has been parsed
      private float stockVal = 0f;
  
    public MyParserCallback( ){
    }

    /**
     * @return the last price parsed so far, or 0 if none has been found
     */
    public float getStockVal( ){
        return stockVal;
    }
        
    /**
     * Notes a bold start tag that follows the "Last Trade:" label; the
     * next text chunk is then treated as the price.
     */
    public void handleStartTag(javax.swing.text.html.HTML.Tag t,
      MutableAttributeSet a,int pos) {
        if (lastTradeFlag && (t == javax.swing.text.html.HTML.Tag.B )){
            boldFlag = true;
        }
    }//handleStartTag

    /**
     * Inspects each text chunk of the page.
     *
     * @throws IllegalStateException if the page says the ticker symbol
     *         does not exist, or if the text in the price position cannot
     *         be parsed as a number
     */
    public void handleText(char[] data,int pos){

        String htmlText = new String(data);
        if (htmlText.indexOf("No such ticker symbol.") != -1){
            throw new IllegalStateException(
              "Invalid ticker symbol in handleText( ) method.");
        } else if (htmlText.equals("Last Trade:")){
            lastTradeFlag = true;
        } else if (boldFlag){
            try{
                stockVal = Float.parseFloat(htmlText);
            } catch (NumberFormatException ne) {
                try{
                    // tease out any commas in the number using 
                    //NumberFormat; the US locale is requested so that
                    //',' is read as a grouping separator whatever the
                    //server's default locale is
                    java.text.NumberFormat nf = java.text.NumberFormat.
                      getInstance(java.util.Locale.US);
                    //parse( ) may return a Long or a Double, so go
                    //through Number instead of casting to Double
                    stockVal = nf.parse(htmlText).floatValue( );
                } catch (java.text.ParseException pe){
                    IllegalStateException failure =
                      new IllegalStateException(
                        "The extracted text " + htmlText +
                        " cannot be parsed as a number!");
                    failure.initCause(pe);
                    throw failure;
                }//try
            }//try
            
            lastTradeFlag = false;
            boldFlag = false;
        }//if
    } //handleText
  }//MyParserCallback
}//HtmlParseServlet

The MyParserCallback inner class defines the parsing algorithm for the servlet, which is explained in Recipe 26.1. The getLatestPrice( ) method uses this callback class and an HTML parser to return a stock quote as a float type.

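The ParserDelegator object is synchronized as it calls parse( ), with the intent that only one thread is parsing the web page and setting the value of stockVal (an instance variable representing the stock value) at one time. Be aware, however, that getLatestPrice( ) creates a new ParserDelegator on every call, so this lock does not reliably serialize request threads; for real thread safety, keep the parsed value in a per-request object (such as the callback instance) rather than in a servlet instance variable.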


This servlet is a little too complicated for one class, as it uses servlet API and HTML parsing API classes. A better design would separate these responsibilities into different Java classes. The upcoming recipes create a JavaBean whose responsibility is to parse HTML for a live stock quote.

Figure 26-1 shows the output of the servlet's doGet( ) method.

Figure 26-1. The user enters a stock symbol and submits the form
figs/jsjc_2601.gif

Figure 26-2 shows the servlet's doPost( ) method output in a Netscape browser.

Figure 26-2. The servlet returns the latest stock price for the symbol
figs/jsjc_2602.gif

See Also

Recipe 26.3 on creating a JavaBean as a web-page parser; Recipe 26.4 and Recipe 26.5 on using the bean with a servlet and a JSP, respectively.

    [ Team LiB ] Previous Section Next Section