##// END OF EJS Templates
Link to watched issues list on my page....
Link to watched issues list on my page. git-svn-id: svn+ssh://rubyforge.org/var/svn/redmine/trunk@2457 e93f8b46-1217-0410-a6f0-8f06a7374b81

File last commit:

r2376:f70be197e0ae
r2396:5bdd4291624c
Show More
htmltokenizer.rb
305 lines | 7.4 KiB | text/x-ruby | RubyLexer
# = HTMLTokenizer
#
# Author:: Ben Giddings (mailto:bg-rubyforge@infofiend.com)
# Copyright:: Copyright (c) 2004 Ben Giddings
# License:: Distributes under the same terms as Ruby
#
#
# This is a partial port of the functionality behind Perl's TokeParser
# Provided a page it progressively returns tokens from that page
#
# $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $
#
# A class to tokenize HTML.
#
# Example:
#
# page = "<HTML>
# <HEAD>
# <TITLE>This is the title</TITLE>
# </HEAD>
# <!-- Here comes the <a href=\"missing.link\">blah</a>
# comment body
# -->
# <BODY>
# <H1>This is the header</H1>
# <P>
# This is the paragraph, it contains
# <a href=\"link.html\">links</a>,
# <img src=\"blah.gif\" optional alt='images
# are
# really cool'>. Ok, here is some more text and
# <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
# </P>
# </body>
# </HTML>
# "
# toke = HTMLTokenizer.new(page)
#
# assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
# assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
# assert("links" == toke.getTrimmedText)
# assert(toke.getTag("IMG", "A").attr_hash['optional'])
# assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
#
class HTMLTokenizer
@@version = 1.0
# Get version of HTMLTokenizer lib
def self.version
@@version
end
attr_reader :page
# Create a new tokenizer, based on the content, used as a string.
def initialize(content)
@page = content.to_s
@cur_pos = 0
end
# Reset the parser, setting the current position back at the stop
def reset
@cur_pos = 0
end
# Look at the next token, but don't actually grab it
def peekNextToken
if @cur_pos == @page.length then return nil end
if ?< == @page[@cur_pos]
# Next token is a tag of some kind
if '!--' == @page[(@cur_pos + 1), 3]
# Token is a comment
tag_end = @page.index('-->', (@cur_pos + 1))
if tag_end.nil?
raise HTMLTokenizerError, "No end found to started comment:\n#{@page[@cur_pos,80]}"
end
# p @page[@cur_pos .. (tag_end+2)]
HTMLComment.new(@page[@cur_pos .. (tag_end + 2)])
else
# Token is a html tag
tag_end = @page.index('>', (@cur_pos + 1))
if tag_end.nil?
raise HTMLTokenizerError, "No end found to started tag:\n#{@page[@cur_pos,80]}"
end
# p @page[@cur_pos .. tag_end]
HTMLTag.new(@page[@cur_pos .. tag_end])
end
else
# Next token is text
text_end = @page.index('<', @cur_pos)
text_end = text_end.nil? ? -1 : (text_end - 1)
# p @page[@cur_pos .. text_end]
HTMLText.new(@page[@cur_pos .. text_end])
end
end
# Get the next token, returns an instance of
# * HTMLText
# * HTMLToken
# * HTMLTag
def getNextToken
token = peekNextToken
if token
# @page = @page[token.raw.length .. -1]
# @page.slice!(0, token.raw.length)
@cur_pos += token.raw.length
end
#p token
#print token.raw
return token
end
# Get a tag from the specified set of desired tags.
# For example:
# <tt>foo = toke.getTag("h1", "h2", "h3")</tt>
# Will return the next header tag encountered.
def getTag(*sought_tags)
sought_tags.collect! {|elm| elm.downcase}
while (tag = getNextToken)
if tag.kind_of?(HTMLTag) and
(0 == sought_tags.length or sought_tags.include?(tag.tag_name))
break
end
end
tag
end
# Get all the text between the current position and the next tag
# (if specified) or a specific later tag
def getText(until_tag = nil)
if until_tag.nil?
if ?< == @page[@cur_pos]
# Next token is a tag, not text
""
else
# Next token is text
getNextToken.text
end
else
ret_str = ""
while (tag = peekNextToken)
if tag.kind_of?(HTMLTag) and tag.tag_name == until_tag
break
end
if ("" != tag.text)
ret_str << (tag.text + " ")
end
getNextToken
end
ret_str
end
end
# Like getText, but squeeze all whitespace, getting rid of
# leading and trailing whitespace, and squeezing multiple
# spaces into a single space.
def getTrimmedText(until_tag = nil)
getText(until_tag).strip.gsub(/\s+/m, " ")
end
end
class HTMLTokenizerError < Exception
end
# The parent class for all three types of HTML tokens
class HTMLToken
attr_accessor :raw
# Initialize the token based on the raw text
def initialize(text)
@raw = text
end
# By default, return exactly the string used to create the text
def to_s
raw
end
# By default tokens have no text representation
def text
""
end
def trimmed_text
text.strip.gsub(/\s+/m, " ")
end
# Compare to another based on the raw source
def ==(other)
raw == other.to_s
end
end
# Class representing text that isn't inside a tag
class HTMLText < HTMLToken
def text
raw
end
end
# Class representing an HTML comment
class HTMLComment < HTMLToken
attr_accessor :contents
def initialize(text)
super(text)
temp_arr = text.scan(/^<!--\s*(.*?)\s*-->$/m)
if temp_arr[0].nil?
raise HTMLTokenizerError, "Text passed to HTMLComment.initialize is not a comment"
end
@contents = temp_arr[0][0]
end
end
# Class representing an HTML tag
class HTMLTag < HTMLToken
attr_reader :end_tag, :tag_name
def initialize(text)
super(text)
if ?< != text[0] or ?> != text[-1]
raise HTMLTokenizerError, "Text passed to HTMLComment.initialize is not a comment"
end
@attr_hash = Hash.new
@raw = text
tag_name = text.scan(/[\w:-]+/)[0]
if tag_name.nil?
raise HTMLTokenizerError, "Error, tag is nil: #{tag_name}"
end
if ?/ == text[1]
# It's an end tag
@end_tag = true
@tag_name = '/' + tag_name.downcase
else
@end_tag = false
@tag_name = tag_name.downcase
end
@hashed = false
end
# Retrieve a hash of all the tag's attributes.
# Lazily done, so that if you don't look at a tag's attributes
# things go quicker
def attr_hash
# Lazy initialize == don't build the hash until it's needed
if !@hashed
if !@end_tag
# Get the attributes
attr_arr = @raw.scan(/<[\w:-]+\s+(.*?)\/?>/m)[0]
if attr_arr.kind_of?(Array)
# Attributes found, parse them
attrs = attr_arr[0]
attr_arr = attrs.scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m)
# clean up the array by:
# * setting all nil elements to true
# * removing enclosing quotes
attr_arr.each {
|item|
val = if item[1].nil?
item[0]
elsif '"'[0] == item[1][0] or '\''[0] == item[1][0]
item[1][1 .. -2]
else
item[1]
end
@attr_hash[item[0].downcase] = val
}
end
end
@hashed = true
end
#p self
@attr_hash
end
# Get the 'alt' text for a tag, if it exists, or an empty string otherwise
def text
if !end_tag
case tag_name
when 'img'
if !attr_hash['alt'].nil?
return attr_hash['alt']
end
when 'applet'
if !attr_hash['alt'].nil?
return attr_hash['alt']
end
end
end
return ''
end
end