jro_apps/redmine Files · vendor/gems/ruby-openid-2.1.4/lib/openid/yadis/htmltokenizer.rb

Adds ability to filter watched issues (#846)....

Adds ability to filter watched issues (#846). git-svn-id: svn+ssh://rubyforge.org/var/svn/redmine/trunk@2456 e93f8b46-1217-0410-a6f0-8f06a7374b81

Eric Davis - - Load All Authors

File last commit:

r2376:f70be197e0ae


                r2395:e1b828de95ec

Download file

             htmltokenizer.rb
        
                    305 lines
            
             | 7.4 KiB
            
                | text/x-ruby
            
             |
                RubyLexer
            
             / vendor / gems / ruby-openid-2.1.4 / lib / openid / yadis / htmltokenizer.rb
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      # = HTMLTokenizer

      #

      # Author::    Ben Giddings  (mailto:bg-rubyforge@infofiend.com)

      # Copyright:: Copyright (c) 2004 Ben Giddings

      # License::   Distributes under the same terms as Ruby

      #

      #

      # This is a partial port of the functionality behind Perl's TokeParser

      # Provided a page it progressively returns tokens from that page

      #

      # $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $

      #

      # A class to tokenize HTML.

      #

      # Example:

      #

      #   page = "<HTML>

      #   <HEAD>

      #   <TITLE>This is the title</TITLE>

      #   </HEAD>

      #    <!-- Here comes the <a href=\"missing.link\">blah</a>

      #    comment body

      #     -->

      #    <BODY>

      #      <H1>This is the header</H1>

      #      <P>

      #        This is the paragraph, it contains

      #        <a href=\"link.html\">links</a>,

      #        <img src=\"blah.gif\" optional alt='images

      #        are

      #        really cool'>.  Ok, here is some more text and

      #        <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.

      #      </P>

      #    </body>

      #    </HTML>

      #    "

      #    toke = HTMLTokenizer.new(page)

      #

      #    assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)

      #    assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))

      #    assert("links" == toke.getTrimmedText)

      #    assert(toke.getTag("IMG", "A").attr_hash['optional'])

      #    assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])

      #

      class HTMLTokenizer
        @@version = 1.0

        # Get version of HTMLTokenizer lib
        def self.version
          @@version
        end

        # The full page being tokenized, as a string.
        attr_reader :page

        # Create a new tokenizer, based on the content, used as a string.
        # +content+ is converted with +to_s+, so +nil+ gives an empty page.
        def initialize(content)
          @page = content.to_s
          @cur_pos = 0
        end

        # Reset the parser, setting the current position back at the start
        def reset
          @cur_pos = 0
        end

        # Look at the next token, but don't actually grab it.
        #
        # Returns +nil+ at the end of the page, otherwise a HTMLComment,
        # HTMLTag or HTMLText built from the text at the current position.
        # Raises HTMLTokenizerError when a started comment or tag is never
        # closed.
        def peekNextToken
          return nil if @cur_pos == @page.length

          if ?< == @page[@cur_pos]
            # Next token is a tag of some kind
            if '!--' == @page[(@cur_pos + 1), 3]
              # Token is a comment; it runs up to and including '-->'
              tag_end = @page.index('-->', (@cur_pos + 1))
              if tag_end.nil?
                raise HTMLTokenizerError, "No end found to started comment:\n#{@page[@cur_pos,80]}"
              end
              HTMLComment.new(@page[@cur_pos .. (tag_end + 2)])
            else
              # Token is a html tag; it runs up to and including the next '>'
              tag_end = @page.index('>', (@cur_pos + 1))
              if tag_end.nil?
                raise HTMLTokenizerError, "No end found to started tag:\n#{@page[@cur_pos,80]}"
              end
              HTMLTag.new(@page[@cur_pos .. tag_end])
            end
          else
            # Next token is text: everything up to the next '<', or the
            # rest of the page when there is no further tag.
            text_end = @page.index('<', @cur_pos)
            text_end = text_end.nil? ? -1 : (text_end - 1)
            HTMLText.new(@page[@cur_pos .. text_end])
          end
        end

        # Get the next token and advance past it. Returns +nil+ at the end
        # of the page, otherwise an instance of
        # * HTMLText
        # * HTMLComment
        # * HTMLTag
        def getNextToken
          token = peekNextToken
          @cur_pos += token.raw.length if token
          token
        end

        # Get a tag from the specified set of desired tags.
        # For example:
        # <tt>foo =  toke.getTag("h1", "h2", "h3")</tt>
        # Will return the next header tag encountered.
        #
        # Tag names are compared case-insensitively. With no arguments the
        # next tag of any kind is returned. Returns +nil+ when the end of
        # the page is reached without a match.
        def getTag(*sought_tags)
          sought_tags.collect! {|elm| elm.downcase}

          while (tag = getNextToken)
            if tag.kind_of?(HTMLTag) and
                (sought_tags.empty? or sought_tags.include?(tag.tag_name))
              break
            end
          end
          tag
        end

        # Get all the text between the current position and the next tag
        # (if no tag is specified) or a specific later tag.
        #
        # Without +until_tag+, returns "" when the next token is a tag or
        # when the end of the page has been reached. With +until_tag+, the
        # text of every token before that tag is collected, each piece
        # followed by a single space; the tag itself is not consumed.
        def getText(until_tag = nil)
          if until_tag.nil?
            # Next token is a tag, not text
            return "" if ?< == @page[@cur_pos]

            # Next token is text, unless we already are at the end of the
            # page, in which case there is no token to ask for its text.
            token = getNextToken
            token.nil? ? "" : token.text
          else
            # Tag names are stored lower-cased, so normalise like getTag does.
            wanted = until_tag.to_s.downcase
            ret_str = ""

            while (tag = peekNextToken)
              break if tag.kind_of?(HTMLTag) and tag.tag_name == wanted

              piece = tag.text
              ret_str << (piece + " ") if "" != piece
              getNextToken
            end

            ret_str
          end
        end

        # Like getText, but squeeze all whitespace, getting rid of
        # leading and trailing whitespace, and squeezing multiple
        # spaces into a single space.
        def getTrimmedText(until_tag = nil)
          getText(until_tag).strip.gsub(/\s+/m, " ")
        end
      end

      # Raised when the tokenizer meets markup it cannot turn into a token.
      # Derives from StandardError (itself an Exception) so that a plain
      # +rescue+ catches it, while existing +rescue HTMLTokenizerError+ and
      # +rescue Exception+ handlers keep working.
      class HTMLTokenizerError < StandardError
      end

      # The parent class for all three types of HTML tokens

      class HTMLToken
        # The exact source text this token was built from.
        attr_reader :raw
        attr_writer :raw

        # Remember the raw source text of the token.
        def initialize(text)
          @raw = text
        end

        # A token prints as the source text it was created from.
        def to_s
          return raw
        end

        # Plain tokens carry no readable text; subclasses override this.
        def text
          return ""
        end

        # The token's text with surrounding whitespace removed and every
        # inner run of whitespace collapsed to one space.
        def trimmed_text
          stripped = text.strip
          stripped.gsub(/\s+/m, " ")
        end

        # Two tokens (or a token and a string) are equal when the raw
        # source matches the other object's string form.
        def ==(other)
          other_source = other.to_s
          raw == other_source
        end
      end

      # Class representing text that isn't inside a tag

      class HTMLText < HTMLToken
        # For a run of plain text, the readable text is the raw source itself.
        def text
          return self.raw
        end
      end

      # Class representing an HTML comment

      class HTMLComment < HTMLToken
        # The comment body, without the <!-- --> delimiters and without the
        # whitespace directly inside them.
        attr_accessor :contents

        # Build a comment token from its raw source.
        # Raises HTMLTokenizerError when +text+ does not look like a comment.
        def initialize(text)
          super(text)

          found = text.match(/^<!--\s*(.*?)\s*-->$/m)
          unless found
            raise HTMLTokenizerError, "Text passed to HTMLComment.initialize is not a comment"
          end

          @contents = found[1]
        end
      end

      # Class representing an HTML tag

      class HTMLTag < HTMLToken
        # end_tag:: true for a closing tag such as </a>
        # tag_name:: lower-cased tag name; closing tags keep a leading '/'
        attr_reader :end_tag, :tag_name

        # Build a tag token from its raw source, which must start with '<'
        # and end with '>'.
        # Raises HTMLTokenizerError when the text is not delimited that way
        # or contains no tag name.
        def initialize(text)
          super(text)

          if ?< != text[0] or ?> != text[-1]
            raise HTMLTokenizerError, "Text passed to HTMLTag.initialize is not a tag"
          end

          @attr_hash = Hash.new

          # The first run of name characters is the tag name.
          tag_name = text.scan(/[\w:-]+/)[0]
          if tag_name.nil?
            raise HTMLTokenizerError, "Error, tag has no name: #{text}"
          end

          if ?/ == text[1]
            # It's an end tag
            @end_tag = true
            @tag_name = '/' + tag_name.downcase
          else
            @end_tag = false
            @tag_name = tag_name.downcase
          end

          @hashed = false
        end

        # Retrieve a hash of all the tag's attributes, keyed by lower-cased
        # attribute name.
        # Lazily done, so that if you don't look at a tag's attributes
        # things go quicker. End tags always give an empty hash.
        def attr_hash
          # Lazy initialize == don't build the hash until it's needed
          unless @hashed
            parse_attributes unless @end_tag
            @hashed = true
          end

          @attr_hash
        end

        # Get the 'alt' text for an img or applet tag, if it exists, or an
        # empty string otherwise
        def text
          return '' if end_tag

          case tag_name
          when 'img', 'applet'
            alt = attr_hash['alt']
            return alt unless alt.nil?
          end

          ''
        end

        private

        # Fill @attr_hash from the attribute section of the raw tag text.
        def parse_attributes
          # Everything between the tag name and the closing '>' (or '/>')
          attr_arr = @raw.scan(/<[\w:-]+\s+(.*?)\/?>/m)[0]
          return unless attr_arr.kind_of?(Array)

          # Attributes found, parse them
          attrs = attr_arr[0]
          pairs = attrs.scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m)

          # clean up the pairs by:
          # * giving an attribute without a value its own name as value
          # * removing enclosing quotes
          pairs.each do |item|
            name = item[0]
            value = item[1]
            val = if value.nil?
                    name
                  elsif '"' == value[0, 1] or "'" == value[0, 1]
                    value[1 .. -2]
                  else
                    value
                  end
            @attr_hash[name.downcase] = val
          end
        end
      end

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				# = HTMLTokenizer
				#
				# Author:: Ben Giddings (mailto:bg-rubyforge@infofiend.com)
				# Copyright:: Copyright (c) 2004 Ben Giddings
				# License:: Distributes under the same terms as Ruby
				#
				#
				# This is a partial port of the functionality behind Perl's TokeParser
				# Provided a page it progressively returns tokens from that page
				#
				# $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $

				#
				# A class to tokenize HTML.
				#
				# Example:
				#
				# page = "<HTML>
				# <HEAD>
				# <TITLE>This is the title</TITLE>
				# </HEAD>
				# <!-- Here comes the <a href=\"missing.link\">blah</a>
				# comment body
				# -->
				# <BODY>
				# <H1>This is the header</H1>
				# <P>
				# This is the paragraph, it contains
				# <a href=\"link.html\">links</a>,
				# <img src=\"blah.gif\" optional alt='images
				# are
				# really cool'>. Ok, here is some more text and
				# <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
				# </P>
				# </body>
				# </HTML>
				# "
				# toke = HTMLTokenizer.new(page)
				#
				# assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
				# assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
				# assert("links" == toke.getTrimmedText)
				# assert(toke.getTag("IMG", "A").attr_hash['optional'])
				# assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
				#
				class HTMLTokenizer
				  @@version = 1.0

				  # Get version of HTMLTokenizer lib
				  def self.version
				    @@version
				  end

				  # The full page being tokenized, as a string.
				  attr_reader :page

				  # Create a new tokenizer, based on the content, used as a string.
				  # +content+ is converted with +to_s+, so +nil+ gives an empty page.
				  def initialize(content)
				    @page = content.to_s
				    @cur_pos = 0
				  end

				  # Reset the parser, setting the current position back at the start
				  def reset
				    @cur_pos = 0
				  end

				  # Look at the next token, but don't actually grab it.
				  #
				  # Returns +nil+ at the end of the page, otherwise a HTMLComment,
				  # HTMLTag or HTMLText built from the text at the current position.
				  # Raises HTMLTokenizerError when a started comment or tag is never
				  # closed.
				  def peekNextToken
				    return nil if @cur_pos == @page.length

				    if ?< == @page[@cur_pos]
				      # Next token is a tag of some kind
				      if '!--' == @page[(@cur_pos + 1), 3]
				        # Token is a comment; it runs up to and including '-->'
				        tag_end = @page.index('-->', (@cur_pos + 1))
				        if tag_end.nil?
				          raise HTMLTokenizerError, "No end found to started comment:\n#{@page[@cur_pos,80]}"
				        end
				        HTMLComment.new(@page[@cur_pos .. (tag_end + 2)])
				      else
				        # Token is a html tag; it runs up to and including the next '>'
				        tag_end = @page.index('>', (@cur_pos + 1))
				        if tag_end.nil?
				          raise HTMLTokenizerError, "No end found to started tag:\n#{@page[@cur_pos,80]}"
				        end
				        HTMLTag.new(@page[@cur_pos .. tag_end])
				      end
				    else
				      # Next token is text: everything up to the next '<', or the
				      # rest of the page when there is no further tag.
				      text_end = @page.index('<', @cur_pos)
				      text_end = text_end.nil? ? -1 : (text_end - 1)
				      HTMLText.new(@page[@cur_pos .. text_end])
				    end
				  end

				  # Get the next token and advance past it. Returns +nil+ at the end
				  # of the page, otherwise an instance of
				  # * HTMLText
				  # * HTMLComment
				  # * HTMLTag
				  def getNextToken
				    token = peekNextToken
				    @cur_pos += token.raw.length if token
				    token
				  end

				  # Get a tag from the specified set of desired tags.
				  # For example:
				  # <tt>foo = toke.getTag("h1", "h2", "h3")</tt>
				  # Will return the next header tag encountered.
				  #
				  # Tag names are compared case-insensitively. With no arguments the
				  # next tag of any kind is returned. Returns +nil+ when the end of
				  # the page is reached without a match.
				  def getTag(*sought_tags)
				    sought_tags.collect! {|elm| elm.downcase}

				    while (tag = getNextToken)
				      if tag.kind_of?(HTMLTag) and
				          (sought_tags.empty? or sought_tags.include?(tag.tag_name))
				        break
				      end
				    end
				    tag
				  end

				  # Get all the text between the current position and the next tag
				  # (if no tag is specified) or a specific later tag.
				  #
				  # Without +until_tag+, returns "" when the next token is a tag or
				  # when the end of the page has been reached. With +until_tag+, the
				  # text of every token before that tag is collected, each piece
				  # followed by a single space; the tag itself is not consumed.
				  def getText(until_tag = nil)
				    if until_tag.nil?
				      # Next token is a tag, not text
				      return "" if ?< == @page[@cur_pos]

				      # Next token is text, unless we already are at the end of the
				      # page, in which case there is no token to ask for its text.
				      token = getNextToken
				      token.nil? ? "" : token.text
				    else
				      # Tag names are stored lower-cased, so normalise like getTag does.
				      wanted = until_tag.to_s.downcase
				      ret_str = ""

				      while (tag = peekNextToken)
				        break if tag.kind_of?(HTMLTag) and tag.tag_name == wanted

				        piece = tag.text
				        ret_str << (piece + " ") if "" != piece
				        getNextToken
				      end

				      ret_str
				    end
				  end

				  # Like getText, but squeeze all whitespace, getting rid of
				  # leading and trailing whitespace, and squeezing multiple
				  # spaces into a single space.
				  def getTrimmedText(until_tag = nil)
				    getText(until_tag).strip.gsub(/\s+/m, " ")
				  end
				end

				# Raised when the tokenizer meets markup it cannot turn into a token.
				# Derives from StandardError (itself an Exception) so that a plain
				# +rescue+ catches it, while existing +rescue HTMLTokenizerError+ and
				# +rescue Exception+ handlers keep working.
				class HTMLTokenizerError < StandardError
				end

				# The parent class for all three types of HTML tokens
				class HTMLToken
				  # The exact source text this token was built from.
				  attr_reader :raw
				  attr_writer :raw

				  # Remember the raw source text of the token.
				  def initialize(text)
				    @raw = text
				  end

				  # A token prints as the source text it was created from.
				  def to_s
				    return raw
				  end

				  # Plain tokens carry no readable text; subclasses override this.
				  def text
				    return ""
				  end

				  # The token's text with surrounding whitespace removed and every
				  # inner run of whitespace collapsed to one space.
				  def trimmed_text
				    stripped = text.strip
				    stripped.gsub(/\s+/m, " ")
				  end

				  # Two tokens (or a token and a string) are equal when the raw
				  # source matches the other object's string form.
				  def ==(other)
				    other_source = other.to_s
				    raw == other_source
				  end
				end

				# Class representing text that isn't inside a tag
				class HTMLText < HTMLToken
				  # For a run of plain text, the readable text is the raw source itself.
				  def text
				    return self.raw
				  end
				end

				# Class representing an HTML comment
				class HTMLComment < HTMLToken
				  # The comment body, without the <!-- --> delimiters and without the
				  # whitespace directly inside them.
				  attr_accessor :contents

				  # Build a comment token from its raw source.
				  # Raises HTMLTokenizerError when +text+ does not look like a comment.
				  def initialize(text)
				    super(text)

				    # Any amount of whitespace may surround a body of any length
				    # (possibly spanning lines, hence the /m flag).
				    temp_arr = text.scan(/^<!--\s*(.*?)\s*-->$/m)
				    if temp_arr[0].nil?
				      raise HTMLTokenizerError, "Text passed to HTMLComment.initialize is not a comment"
				    end

				    @contents = temp_arr[0][0]
				  end
				end

				# Class representing an HTML tag
				class HTMLTag < HTMLToken
				  # end_tag:: true for a closing tag such as </a>
				  # tag_name:: lower-cased tag name; closing tags keep a leading '/'
				  attr_reader :end_tag, :tag_name

				  # Build a tag token from its raw source, which must start with '<'
				  # and end with '>'.
				  # Raises HTMLTokenizerError when the text is not delimited that way
				  # or contains no tag name.
				  def initialize(text)
				    super(text)

				    if ?< != text[0] or ?> != text[-1]
				      raise HTMLTokenizerError, "Text passed to HTMLTag.initialize is not a tag"
				    end

				    @attr_hash = Hash.new

				    # The first run of name characters is the tag name.
				    tag_name = text.scan(/[\w:-]+/)[0]
				    if tag_name.nil?
				      raise HTMLTokenizerError, "Error, tag has no name: #{text}"
				    end

				    if ?/ == text[1]
				      # It's an end tag
				      @end_tag = true
				      @tag_name = '/' + tag_name.downcase
				    else
				      @end_tag = false
				      @tag_name = tag_name.downcase
				    end

				    @hashed = false
				  end

				  # Retrieve a hash of all the tag's attributes, keyed by lower-cased
				  # attribute name.
				  # Lazily done, so that if you don't look at a tag's attributes
				  # things go quicker. End tags always give an empty hash.
				  def attr_hash
				    # Lazy initialize == don't build the hash until it's needed
				    unless @hashed
				      parse_attributes unless @end_tag
				      @hashed = true
				    end

				    @attr_hash
				  end

				  # Get the 'alt' text for an img or applet tag, if it exists, or an
				  # empty string otherwise
				  def text
				    return '' if end_tag

				    case tag_name
				    when 'img', 'applet'
				      alt = attr_hash['alt']
				      return alt unless alt.nil?
				    end

				    ''
				  end

				  private

				  # Fill @attr_hash from the attribute section of the raw tag text.
				  def parse_attributes
				    # Everything between the tag name and the closing '>' (or '/>')
				    attr_arr = @raw.scan(/<[\w:-]+\s+(.*?)\/?>/m)[0]
				    return unless attr_arr.kind_of?(Array)

				    # Attributes found, parse them
				    attrs = attr_arr[0]
				    pairs = attrs.scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m)

				    # clean up the pairs by:
				    # * giving an attribute without a value its own name as value
				    # * removing enclosing quotes
				    pairs.each do |item|
				      name = item[0]
				      value = item[1]
				      val = if value.nil?
				              name
				            elsif '"' == value[0, 1] or "'" == value[0, 1]
				              value[1 .. -2]
				            else
				              value
				            end
				      @attr_hash[name.downcase] = val
				    end
				  end
				end