jro_apps/redmine Files · vendor/gems/ruby-openid-2.1.4/lib/openid/yadis/htmltokenizer.rb

Link to watched issues list on my page....

Link to watched issues list on my page. git-svn-id: svn+ssh://rubyforge.org/var/svn/redmine/trunk@2457 e93f8b46-1217-0410-a6f0-8f06a7374b81

Eric Davis - - Load All Authors

File last commit:

r2376:f70be197e0ae


                r2396:5bdd4291624c

Download file

             htmltokenizer.rb
        
                    305 lines
            
             | 7.4 KiB
            
                | text/x-ruby
            
             |
                RubyLexer
            
             / vendor / gems / ruby-openid-2.1.4 / lib / openid / yadis / htmltokenizer.rb
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        Eric Davis
    
Unpacked OpenID gem. #699...

              r2376
            
      # = HTMLTokenizer

      #

      # Author::    Ben Giddings  (mailto:bg-rubyforge@infofiend.com)

      # Copyright:: Copyright (c) 2004 Ben Giddings

      # License::   Distributed under the same terms as Ruby

      #

      #

      # This is a partial port of the functionality behind Perl's HTML::TokeParser.

      # Given a page, it progressively returns tokens from that page.

      #

      # $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $

      #

      # A class to tokenize HTML.

      #

      # Example:

      #

      #   page = "<HTML>

      #   <HEAD>

      #   <TITLE>This is the title</TITLE>

      #   </HEAD>

      #    <!-- Here comes the <a href=\"missing.link\">blah</a>

      #    comment body

      #     -->

      #    <BODY>

      #      <H1>This is the header</H1>

      #      <P>

      #        This is the paragraph, it contains

      #        <a href=\"link.html\">links</a>,

      #        <img src=\"blah.gif\" optional alt='images

      #        are

      #        really cool'>.  Ok, here is some more text and

      #        <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.

      #      </P>

      #    </body>

      #    </HTML>

      #    "

      #    toke = HTMLTokenizer.new(page)

      #

      #    assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)

      #    assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))

      #    assert("links" == toke.getTrimmedText)

      #    assert(toke.getTag("IMG", "A").attr_hash['optional'])

      #    assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])

      #

      class HTMLTokenizer
        # The page is kept untouched in @page; tokenizing only moves the
        # integer cursor @cur_pos forward, so #reset can rewind at any time.

        @@version = 1.0

        # Get version of HTMLTokenizer lib
        def self.version
          @@version
        end

        # The full text being tokenized, as a String.
        attr_reader :page

        # Create a new tokenizer, based on the content, used as a string.
        #
        # content:: converted with to_s, so nil yields an empty page.
        def initialize(content)
          @page = content.to_s
          @cur_pos = 0
        end

        # Reset the parser, setting the current position back at the start
        def reset
          @cur_pos = 0
        end

        # Look at the next token, but don't actually grab it.
        #
        # Returns nil at the end of the page, otherwise an HTMLComment,
        # HTMLTag or HTMLText built from the text at the current position.
        # Raises HTMLTokenizerError when a comment or tag is opened but its
        # closing delimiter is never found.
        def peekNextToken
          return nil if @cur_pos >= @page.length

          if ?< == @page[@cur_pos]
            # Next token is a tag of some kind
            if '!--' == @page[(@cur_pos + 1), 3]
              # Token is a comment
              tag_end = @page.index('-->', (@cur_pos + 1))
              if tag_end.nil?
                raise HTMLTokenizerError, "No end found to started comment:\n#{@page[@cur_pos,80]}"
              end
              # tag_end is the index of the first '-' of '-->'; +2 reaches the '>'
              HTMLComment.new(@page[@cur_pos .. (tag_end + 2)])
            else
              # Token is a html tag
              tag_end = @page.index('>', (@cur_pos + 1))
              if tag_end.nil?
                raise HTMLTokenizerError, "No end found to started tag:\n#{@page[@cur_pos,80]}"
              end
              HTMLTag.new(@page[@cur_pos .. tag_end])
            end
          else
            # Next token is text: everything up to (not including) the next
            # '<', or the rest of the page when there is no further '<'.
            text_end = @page.index('<', @cur_pos)
            text_end = text_end.nil? ? -1 : (text_end - 1)
            HTMLText.new(@page[@cur_pos .. text_end])
          end
        end

        # Get the next token and advance past it. Returns nil at the end of
        # the page, otherwise an instance of
        # * HTMLText
        # * HTMLComment
        # * HTMLTag
        def getNextToken
          token = peekNextToken
          # The token's raw source is exactly the slice that was consumed.
          @cur_pos += token.raw.length if token
          token
        end

        # Get a tag from the specified set of desired tags.
        # For example:
        # <tt>foo =  toke.getTag("h1", "h2", "h3")</tt>
        # Will return the next header tag encountered.
        #
        # Names are matched case-insensitively. With no arguments the next tag
        # of any kind is returned. Returns nil when the page is exhausted.
        # All tokens skipped on the way are consumed.
        def getTag(*sought_tags)
          wanted = sought_tags.collect {|elm| elm.downcase}

          while (tag = getNextToken)
            if tag.kind_of?(HTMLTag) &&
                (wanted.empty? || wanted.include?(tag.tag_name))
              break
            end
          end
          tag
        end

        # Get text starting at the current position.
        #
        # Without an argument: returns the text up to the next tag and
        # consumes it; returns "" when the next token is a tag or the page
        # is exhausted.
        #
        # With until_tag: consumes every token up to (but not including) the
        # next tag with that name and returns their text, each non-empty
        # piece followed by a single space. The name is matched
        # case-insensitively, like getTag; end tags are named with a leading
        # slash (e.g. "/a").
        def getText(until_tag = nil)
          if until_tag.nil?
            # Next token is a tag, not text
            return "" if ?< == @page[@cur_pos]

            # Next token is text, unless we are already at the end of the
            # page, in which case there is no token at all.
            token = getNextToken
            token.nil? ? "" : token.text
          else
            # HTMLTag#tag_name is always lower case, so normalise the target.
            wanted = until_tag.to_s.downcase
            ret_str = ""

            while (tag = peekNextToken)
              break if tag.kind_of?(HTMLTag) && tag.tag_name == wanted

              piece = tag.text
              ret_str << (piece + " ") if "" != piece
              # Consume the token we just peeked at without re-parsing it.
              @cur_pos += tag.raw.length
            end

            ret_str
          end
        end

        # Like getText, but squeeze all whitespace, getting rid of
        # leading and trailing whitespace, and squeezing multiple
        # spaces into a single space.
        def getTrimmedText(until_tag = nil)
          getText(until_tag).strip.gsub(/\s+/m, " ")
        end
      end

      # Raised when the tokenizer meets markup it cannot split into tokens
      # (an unterminated tag or comment, or text that is not a tag/comment).
      #
      # Derives from StandardError so that a plain `rescue` catches it;
      # rescuing HTMLTokenizerError or Exception explicitly still works.
      class HTMLTokenizerError < StandardError
      end

      # The parent class for all three types of HTML tokens

      class HTMLToken
        # The exact source text this token was created from.
        attr_accessor :raw

        # Remember the source text that makes up this token.
        def initialize(text)
          @raw = text
        end

        # A token prints as its untouched source text.
        def to_s
          self.raw
        end

        # A generic token contributes no readable text; subclasses override.
        def text
          ""
        end

        # The token's text with surrounding whitespace removed and every
        # inner run of whitespace collapsed to one space.
        def trimmed_text
          stripped = text.strip
          stripped.gsub(/\s+/m, " ")
        end

        # Tokens are equal to anything whose to_s matches their raw source.
        def ==(other)
          other_source = other.to_s
          raw == other_source
        end
      end

      # Class representing text that isn't inside a tag

      class HTMLText < HTMLToken
        # For plain text the readable text is the raw source itself.
        def text
          self.raw
        end
      end

      # Class representing an HTML comment

      class HTMLComment < HTMLToken
        # The comment body: the text between <!-- and -->, without the
        # whitespace directly inside the delimiters.
        attr_accessor :contents

        # Build a comment token from its full source text.
        # Raises HTMLTokenizerError when the text does not match <!-- ... -->.
        def initialize(text)
          super(text)

          first_match = text.scan(/^<!--\s*(.*?)\s*-->$/m).first
          if first_match.nil?
            raise HTMLTokenizerError, "Text passed to HTMLComment.initialize is not a comment"
          end

          # The single capture group holds the comment body.
          @contents = first_match.first
        end
      end

      # Class representing an HTML tag

      class HTMLTag < HTMLToken
        # end_tag::  true when the tag source starts with "</"
        # tag_name:: lower-cased tag name; end tags keep a leading "/"
        #            (for example "/a")
        attr_reader :end_tag, :tag_name

        # Build a tag token from its full source text.
        #
        # text:: the tag source; must start with "<" and end with ">".
        #
        # Raises HTMLTokenizerError when the text is not delimited by < and >
        # or when no tag name can be found in it.
        def initialize(text)
          super(text) # stores the source in @raw
          if ?< != text[0] || ?> != text[-1]
            raise HTMLTokenizerError, "Text passed to HTMLTag.initialize is not a tag"
          end

          @attr_hash = Hash.new
          @hashed = false

          # The first run of name characters is the tag name.
          tag_name = text.scan(/[\w:-]+/)[0]
          if tag_name.nil?
            raise HTMLTokenizerError, "Error, tag is nil: #{text}"
          end

          @end_tag = (?/ == text[1])
          @tag_name = @end_tag ? ('/' + tag_name.downcase) : tag_name.downcase
        end

        # Retrieve a hash of all the tag's attributes.
        # Lazily done, so that if you don't look at a tag's attributes
        # things go quicker
        #
        # Keys are lower-cased attribute names. End tags always yield an
        # empty hash.
        def attr_hash
          # Lazy initialize == don't build the hash until it's needed
          unless @hashed
            parse_attributes unless @end_tag
            @hashed = true
          end

          @attr_hash
        end

        # Get the 'alt' text for a tag, if it exists, or an empty string otherwise
        #
        # Only <img> and <applet> start tags are considered.
        def text
          return '' if end_tag

          case tag_name
          when 'img', 'applet'
            alt = attr_hash['alt']
            return alt unless alt.nil?
          end

          ''
        end

        private

        # Fill @attr_hash from the attribute section of the raw tag source.
        def parse_attributes
          # Everything between the tag name and the closing ">" (or "/>")
          attr_arr = @raw.scan(/<[\w:-]+\s+(.*?)\/?>/m)[0]
          return unless attr_arr.kind_of?(Array)

          # Attributes found, parse them
          attrs = attr_arr[0]
          pairs = attrs.scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m)

          # clean up the pairs by:
          # * giving attributes without a value their own name as value
          # * removing enclosing quotes
          pairs.each do |item|
            name = item[0]
            value = item[1]
            val = if value.nil?
                    name
                  elsif '"'[0] == value[0] || '\''[0] == value[0]
                    value[1 .. -2]
                  else
                    value
                  end
            @attr_hash[name.downcase] = val
          end
        end
      end

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

Eric Davis Unpacked OpenID gem. #699...	r2376	# = HTMLTokenizer
		#
		# Author:: Ben Giddings (mailto:bg-rubyforge@infofiend.com)
		# Copyright:: Copyright (c) 2004 Ben Giddings
		# License:: Distributed under the same terms as Ruby
		#
		#
		# This is a partial port of the functionality behind Perl's HTML::TokeParser.
		# Given a page, it progressively returns tokens from that page.
		#
		# $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $

		#
		# A class to tokenize HTML.
		#
		# Example:
		#
		# page = "<HTML>
		# <HEAD>
		# <TITLE>This is the title</TITLE>
		# </HEAD>
		# <!-- Here comes the <a href=\"missing.link\">blah</a>
		# comment body
		# -->
		# <BODY>
		# <H1>This is the header</H1>
		# <P>
		# This is the paragraph, it contains
		# <a href=\"link.html\">links</a>,
		# <img src=\"blah.gif\" optional alt='images
		# are
		# really cool'>. Ok, here is some more text and
		# <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
		# </P>
		# </body>
		# </HTML>
		# "
		# toke = HTMLTokenizer.new(page)
		#
		# assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
		# assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
		# assert("links" == toke.getTrimmedText)
		# assert(toke.getTag("IMG", "A").attr_hash['optional'])
		# assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
		#
		class HTMLTokenizer
		  # The page is kept untouched in @page; tokenizing only moves the
		  # integer cursor @cur_pos forward, so #reset can rewind at any time.

		  @@version = 1.0

		  # Get version of HTMLTokenizer lib
		  def self.version
		    @@version
		  end

		  # The full text being tokenized, as a String.
		  attr_reader :page

		  # Create a new tokenizer, based on the content, used as a string.
		  #
		  # content:: converted with to_s, so nil yields an empty page.
		  def initialize(content)
		    @page = content.to_s
		    @cur_pos = 0
		  end

		  # Reset the parser, setting the current position back at the start
		  def reset
		    @cur_pos = 0
		  end

		  # Look at the next token, but don't actually grab it.
		  #
		  # Returns nil at the end of the page, otherwise an HTMLComment,
		  # HTMLTag or HTMLText built from the text at the current position.
		  # Raises HTMLTokenizerError when a comment or tag is opened but its
		  # closing delimiter is never found.
		  def peekNextToken
		    return nil if @cur_pos >= @page.length

		    if ?< == @page[@cur_pos]
		      # Next token is a tag of some kind
		      if '!--' == @page[(@cur_pos + 1), 3]
		        # Token is a comment
		        tag_end = @page.index('-->', (@cur_pos + 1))
		        if tag_end.nil?
		          raise HTMLTokenizerError, "No end found to started comment:\n#{@page[@cur_pos,80]}"
		        end
		        # tag_end is the index of the first '-' of '-->'; +2 reaches the '>'
		        HTMLComment.new(@page[@cur_pos .. (tag_end + 2)])
		      else
		        # Token is a html tag
		        tag_end = @page.index('>', (@cur_pos + 1))
		        if tag_end.nil?
		          raise HTMLTokenizerError, "No end found to started tag:\n#{@page[@cur_pos,80]}"
		        end
		        HTMLTag.new(@page[@cur_pos .. tag_end])
		      end
		    else
		      # Next token is text: everything up to (not including) the next
		      # '<', or the rest of the page when there is no further '<'.
		      text_end = @page.index('<', @cur_pos)
		      text_end = text_end.nil? ? -1 : (text_end - 1)
		      HTMLText.new(@page[@cur_pos .. text_end])
		    end
		  end

		  # Get the next token and advance past it. Returns nil at the end of
		  # the page, otherwise an instance of
		  # * HTMLText
		  # * HTMLComment
		  # * HTMLTag
		  def getNextToken
		    token = peekNextToken
		    # The token's raw source is exactly the slice that was consumed.
		    @cur_pos += token.raw.length if token
		    token
		  end

		  # Get a tag from the specified set of desired tags.
		  # For example:
		  # <tt>foo = toke.getTag("h1", "h2", "h3")</tt>
		  # Will return the next header tag encountered.
		  #
		  # Names are matched case-insensitively. With no arguments the next tag
		  # of any kind is returned. Returns nil when the page is exhausted.
		  # All tokens skipped on the way are consumed.
		  def getTag(*sought_tags)
		    wanted = sought_tags.collect {|elm| elm.downcase}

		    while (tag = getNextToken)
		      if tag.kind_of?(HTMLTag) &&
		          (wanted.empty? || wanted.include?(tag.tag_name))
		        break
		      end
		    end
		    tag
		  end

		  # Get text starting at the current position.
		  #
		  # Without an argument: returns the text up to the next tag and
		  # consumes it; returns "" when the next token is a tag or the page
		  # is exhausted.
		  #
		  # With until_tag: consumes every token up to (but not including) the
		  # next tag with that name and returns their text, each non-empty
		  # piece followed by a single space. The name is matched
		  # case-insensitively, like getTag; end tags are named with a leading
		  # slash (e.g. "/a").
		  def getText(until_tag = nil)
		    if until_tag.nil?
		      # Next token is a tag, not text
		      return "" if ?< == @page[@cur_pos]

		      # Next token is text, unless we are already at the end of the
		      # page, in which case there is no token at all.
		      token = getNextToken
		      token.nil? ? "" : token.text
		    else
		      # HTMLTag#tag_name is always lower case, so normalise the target.
		      wanted = until_tag.to_s.downcase
		      ret_str = ""

		      while (tag = peekNextToken)
		        break if tag.kind_of?(HTMLTag) && tag.tag_name == wanted

		        piece = tag.text
		        ret_str << (piece + " ") if "" != piece
		        # Consume the token we just peeked at without re-parsing it.
		        @cur_pos += tag.raw.length
		      end

		      ret_str
		    end
		  end

		  # Like getText, but squeeze all whitespace, getting rid of
		  # leading and trailing whitespace, and squeezing multiple
		  # spaces into a single space.
		  def getTrimmedText(until_tag = nil)
		    getText(until_tag).strip.gsub(/\s+/m, " ")
		  end
		end

		# Raised when the tokenizer meets markup it cannot split into tokens
		# (an unterminated tag or comment, or text that is not a tag/comment).
		#
		# Derives from StandardError so that a plain `rescue` catches it;
		# rescuing HTMLTokenizerError or Exception explicitly still works.
		class HTMLTokenizerError < StandardError
		end

		# The parent class for all three types of HTML tokens
		class HTMLToken
		  # The exact source text this token was created from.
		  attr_accessor :raw

		  # Remember the source text that makes up this token.
		  def initialize(text)
		    @raw = text
		  end

		  # A token prints as its untouched source text.
		  def to_s
		    self.raw
		  end

		  # A generic token contributes no readable text; subclasses override.
		  def text
		    ""
		  end

		  # The token's text with surrounding whitespace removed and every
		  # inner run of whitespace collapsed to one space.
		  def trimmed_text
		    stripped = text.strip
		    stripped.gsub(/\s+/m, " ")
		  end

		  # Tokens are equal to anything whose to_s matches their raw source.
		  def ==(other)
		    other_source = other.to_s
		    raw == other_source
		  end
		end

		# Class representing text that isn't inside a tag
		class HTMLText < HTMLToken
		  # For plain text the readable text is the raw source itself.
		  def text
		    self.raw
		  end
		end

		# Class representing an HTML comment
		class HTMLComment < HTMLToken
		  # The comment body: the text between <!-- and -->, without the
		  # whitespace directly inside the delimiters.
		  attr_accessor :contents

		  # Build a comment token from its full source text.
		  # Raises HTMLTokenizerError when the text does not match <!-- ... -->.
		  def initialize(text)
		    super(text)

		    # Optional whitespace, a lazily-matched body of any length, optional
		    # whitespace. The quantifiers are required: without them only
		    # comments with a body of at most one character would be accepted.
		    temp_arr = text.scan(/^<!--\s*(.*?)\s*-->$/m)
		    if temp_arr[0].nil?
		      raise HTMLTokenizerError, "Text passed to HTMLComment.initialize is not a comment"
		    end

		    # The single capture group holds the comment body.
		    @contents = temp_arr[0][0]
		  end
		end

		# Class representing an HTML tag
		class HTMLTag < HTMLToken
		  # end_tag::  true when the tag source starts with "</"
		  # tag_name:: lower-cased tag name; end tags keep a leading "/"
		  #            (for example "/a")
		  attr_reader :end_tag, :tag_name

		  # Build a tag token from its full source text.
		  #
		  # text:: the tag source; must start with "<" and end with ">".
		  #
		  # Raises HTMLTokenizerError when the text is not delimited by < and >
		  # or when no tag name can be found in it.
		  def initialize(text)
		    super(text) # stores the source in @raw
		    if ?< != text[0] || ?> != text[-1]
		      raise HTMLTokenizerError, "Text passed to HTMLTag.initialize is not a tag"
		    end

		    @attr_hash = Hash.new
		    @hashed = false

		    # The first run of name characters is the tag name.
		    tag_name = text.scan(/[\w:-]+/)[0]
		    if tag_name.nil?
		      raise HTMLTokenizerError, "Error, tag is nil: #{text}"
		    end

		    @end_tag = (?/ == text[1])
		    @tag_name = @end_tag ? ('/' + tag_name.downcase) : tag_name.downcase
		  end

		  # Retrieve a hash of all the tag's attributes.
		  # Lazily done, so that if you don't look at a tag's attributes
		  # things go quicker
		  #
		  # Keys are lower-cased attribute names. End tags always yield an
		  # empty hash.
		  def attr_hash
		    # Lazy initialize == don't build the hash until it's needed
		    unless @hashed
		      parse_attributes unless @end_tag
		      @hashed = true
		    end

		    @attr_hash
		  end

		  # Get the 'alt' text for a tag, if it exists, or an empty string otherwise
		  #
		  # Only <img> and <applet> start tags are considered.
		  def text
		    return '' if end_tag

		    case tag_name
		    when 'img', 'applet'
		      alt = attr_hash['alt']
		      return alt unless alt.nil?
		    end

		    ''
		  end

		  private

		  # Fill @attr_hash from the attribute section of the raw tag source.
		  def parse_attributes
		    # Everything between the tag name and the closing ">" (or "/>")
		    attr_arr = @raw.scan(/<[\w:-]+\s+(.*?)\/?>/m)[0]
		    return unless attr_arr.kind_of?(Array)

		    # Attributes found, parse them. Each match is a name optionally
		    # followed by = and a double-quoted, single-quoted or bare value.
		    attrs = attr_arr[0]
		    pairs = attrs.scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m)

		    # clean up the pairs by:
		    # * giving attributes without a value their own name as value
		    # * removing enclosing quotes
		    pairs.each do |item|
		      name = item[0]
		      value = item[1]
		      val = if value.nil?
		              name
		            elsif '"'[0] == value[0] || '\''[0] == value[0]
		              value[1 .. -2]
		            else
		              value
		            end
		      @attr_hash[name.downcase] = val
		    end
		  end
		end