/*
 * HTML Parser
 * Copyright (C) 1997 David McNicol
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * file COPYING for more details.
 */

package cvu.html;

import java.util.Enumeration;
import java.util.Locale;
import java.util.Vector;

/**
 * This represents a single HTML tag. Each TagToken has a name and a
 * list of attributes and values.
 * <p>
 * The raw tag text is split into whitespace-separated tokens (honouring
 * single/double quotes and backslash escapes). The first token becomes
 * the tag name; a leading '/' on it marks the tag as an end tag. The
 * remaining tokens are interpreted as attributes.
 * @see HTMLTokenizer
 * @author David McNicol
 */
public class TagToken {

	/** Identifies the escape character. */
	public static final char ESCAPE = '\\';

	/** Identifies the double quotation character. */
	public static final char DOUBLE_QUOTE = '"';

	/** Identifies the single quotation character. */
	public static final char SINGLE_QUOTE = '\'';

	/** Stores the (lower-cased) name of the TagToken, without any leading '/'. */
	private String name;

	/** Indicates whether the TagToken is an end-token. */
	private boolean end = false;

	/** Stores a list of attributes and their values. */
	private AttributeList attr;

	/**
	 * Constructs a new TagToken converting the specified string
	 * into a token name and a list of attributes with values.
	 * If the string is null or empty the name stays null and the
	 * attribute list stays empty.
	 * @param line the raw data.
	 */
	public TagToken (String line) {
		name = null;
		attr = new AttributeList();

		tokenizeAttributes(line);
	}

	/**
	 * Returns the name of the TagToken.
	 */
	public String getName () {
		return name;
	}

	/**
	 * Returns the attribute list of the TagToken.
	 */
	public AttributeList getAttributes () {
		return attr;
	}

	/**
	 * Indicates whether this token is an end tag.
	 */
	public boolean isEndTag () {
		return end;
	}

	/**
	 * Returns true if the given attribute exists.
	 * @param name the name of the attribute.
	 */
	public boolean isAttribute (String name) {
		return attr.exists(name);
	}

	/**
	 * Returns the value of the specified attribute or null if the
	 * attribute does not exist.
	 * @param name the name of the attribute.
	 */
	public String getAttribute (String name) {
		return attr.get(name);
	}

	/**
	 * Returns an attribute with all double quote characters
	 * escaped with a backslash.
	 * @param name the name of the attribute.
	 */
	public String getQuotedAttribute (String name) {

		// Check that the attribute list is there.
		if (attr == null) return null;

		// Return the quoted version.
		return attr.getQuoted(name);
	}

	/**
	 * Returns a string version of the attribute and its value.
	 * @param name the name of the attribute.
	 */
	public String getAttributeToString (String name) {

		// Check that the attribute list is there.
		if (attr == null) return null;

		// Return the string version.
		return attr.toString(name);
	}

	/**
	 * Returns a string version of the TagToken: the opening
	 * bracket (with a slash for end tags), the name, the
	 * attributes if there are any, and the closing bracket.
	 */
	public String toString () {

		StringBuffer sb = new StringBuffer();

		// Write the opening of the tag.
		if (end)
			sb.append("</").append(name);
		else
			sb.append('<').append(name);

		// Check if there are any attributes.
		// NOTE(review): this test was reconstructed from a damaged
		// copy of the file; confirm AttributeList exposes size().
		if (attr != null && attr.size() > 0) {

			// Print string version of the attributes.
			sb.append(' ').append(attr.toString());
		}

		// Finish off the tag.
		sb.append('>');

		// Return the string version.
		return sb.toString();
	}

	/**
	 * Sets the name of the token and also whether it is a begin
	 * or an end token. The name is stored in lower case; a leading
	 * '/' is stripped and flags the token as an end tag.
	 * @param name the name of the token.
	 */
	private void setName (String name) {

		if (name == null) {
			this.name = null;
			return;
		}

		// Use a fixed locale: with the default locale a Turkish
		// environment would map 'I' to a dotless 'i' and corrupt
		// tag names such as TITLE or IMG.
		String lcname = name.toLowerCase(Locale.ENGLISH);

		if (lcname.length() > 0 && lcname.charAt(0) == '/') {
			this.name = lcname.substring(1);
			end = true;
		} else {
			this.name = lcname;
		}
	}

	/**
	 * Adds a attribute and value to the list.
	 * @param name the name of the attribute.
	 * @param value the value of the attribute.
	 */
	private void setAttribute (String name, String value) {
		attr.set(name, value);
	}

	/**
	 * Adds a attribute to the list using the given string. The string
	 * may either be in the form 'attribute' or 'attribute=value'.
	 * A string without '=' is stored with an empty-string value.
	 * @param s contains the attribute information.
	 */
	private void setAttribute (String s) {

		// Check if the string is null.
		if (s == null) return;

		// Get the index of = within the string.
		int idx = s.indexOf('=');

		if (idx < 0) {

			// No '=' present: the whole string is the attribute
			// name and the value is the empty string.
			setAttribute(s, "");
		} else {

			// Split around the first '=' only, so the value may
			// itself contain further '=' characters.
			setAttribute(s.substring(0, idx), s.substring(idx + 1));
		}
	}

	/**
	 * Tokenizes the given string and uses the resulting vector
	 * to build up the TagToken's attribute list. The first token
	 * is the tag name; every following token is an attribute, in
	 * one of the forms 'name', 'name=value' or the three separate
	 * tokens 'name', '=', 'value'.
	 * @param args the string to tokenize.
	 */
	private void tokenizeAttributes (String args) {

		// Get the vector of tokens.
		Vector v = tokenizeString(args);

		// Nothing to do for null/empty input.
		if (v == null || v.isEmpty()) return;

		// Store the first element as the TagToken's name.
		setName((String) v.elementAt(0));

		// Stop processing now if there are no more elements.
		int length = v.size() - 1;
		if (length <= 0) return;

		// Put the rest of the elements into a string array so that
		// consumed tokens can be blanked out with null.
		String[] tokens = new String[length];
		for (int i = 0; i < length; i++)
			tokens[i] = (String) v.elementAt(i + 1);

		// Deal with the name/value pairs with separate = signs.
		for (int i = 1; i < (length - 1); i++) {

			if (tokens[i] == null || !tokens[i].equals("=")) continue;

			// A neighbour may already have been consumed by an
			// earlier pair (e.g. "a = b = c"). Do not pass a null
			// name or value on; leave this '=' for the second pass,
			// the same as an '=' at either end of the list.
			if (tokens[i - 1] == null || tokens[i + 1] == null) continue;

			setAttribute(tokens[i - 1], tokens[i + 1]);
			tokens[i - 1] = null;
			tokens[i] = null;
			tokens[i + 1] = null;
		}

		// Deal with lone attributes and joined name/value pairs.
		for (int i = 0; i < length; i++)
			if (tokens[i] != null) setAttribute(tokens[i]);
	}

	/**
	 * This method tokenizes the given string and returns a vector
	 * of its constituent tokens. It understands quoting and character
	 * escapes: quote characters and the escape character themselves
	 * are removed, an escaped character is always taken literally,
	 * and whitespace inside quotes does not end a token.
	 * @param s the string to tokenize.
	 * @return a vector of String tokens, or null if the string is
	 * null or zero-length.
	 */
	private Vector tokenizeString (String s) {

		// First check that the args are not null or zero-length.
		if (s == null || s.length() == 0) return null;

		boolean whitespace = false; // True if we are skipping w/space.
		boolean escaped = false;    // True if next char is escaped.
		boolean quoted = false;     // True if we are in quotes.

		// The character (single or double quote) used to start
		// the last quoted section.
		char quotedChar = DOUBLE_QUOTE;

		// Create a vector to store the complete tokens.
		Vector tokens = new Vector();

		// Create a buffer to store an individual token.
		StringBuffer buffer = new StringBuffer(80);

		int length = s.length();

		for (int i = 0; i < length; i++) {

			char c = s.charAt(i);

			// Skip the rest of a run of whitespace between tokens.
			if (whitespace) {
				if (isWhitespace(c)) continue;
				whitespace = false;
			}

			// An escaped character is copied verbatim.
			if (escaped) {
				buffer.append(c);
				escaped = false;
				continue;
			}

			// Check for the escape character.
			if (c == ESCAPE) {
				escaped = true;
				continue;
			}

			// A quote opens a quoted section, or closes it if it
			// matches the quote that opened it. The other kind of
			// quote inside a quoted section is ordinary text.
			if ((c == DOUBLE_QUOTE || c == SINGLE_QUOTE)
					&& (!quoted || quotedChar == c)) {
				quoted = !quoted;
				quotedChar = c;
				continue;
			}

			// Unquoted whitespace ends the current token.
			if (!quoted && isWhitespace(c)) {
				tokens.addElement(buffer.toString());
				buffer = new StringBuffer(80);
				whitespace = true;
				continue;
			}

			// Otherwise add the character to the buffer.
			buffer.append(c);
		}

		// Add the last token to the vector if there is one.
		if (! whitespace) tokens.addElement(buffer.toString());

		return tokens;
	}

	/**
	 * Returns true if the given character is considered to be
	 * whitespace: space, tab, line feed, carriage return or form
	 * feed. Carriage return is included so that tags wrapped over
	 * CRLF line endings do not leave a stray '\r' inside a token.
	 * @param c the character to test.
	 */
	private boolean isWhitespace (char c) {
		return (c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f');
	}
}