scanner.rb
298 lines
| 8.0 KiB
| text/x-ruby
|
RubyLexer
|
r638 | module CodeRay | ||
require 'coderay/helpers/plugin' | ||||
# = Scanners | ||||
# | ||||
# This module holds the Scanner class and its subclasses. | ||||
# For example, the Ruby scanner is named CodeRay::Scanners::Ruby
# and can be found in coderay/scanners/ruby.
# | ||||
# Scanner also provides methods and constants for the register | ||||
# mechanism and the [] method that returns the Scanner class | ||||
# belonging to the given lang. | ||||
# | ||||
# See PluginHost. | ||||
module Scanners | ||||
extend PluginHost | ||||
plugin_path File.dirname(__FILE__), 'scanners' | ||||
require 'strscan' | ||||
# = Scanner | ||||
# | ||||
# The base class for all Scanners. | ||||
# | ||||
# It is a subclass of Ruby's great +StringScanner+, which | ||||
# makes it easy to access the scanning methods inside. | ||||
# | ||||
# It is also +Enumerable+, so you can use it like an Array of | ||||
# Tokens: | ||||
# | ||||
# require 'coderay' | ||||
# | ||||
# c_scanner = CodeRay::Scanners[:c].new "if (*p == '{') nest++;" | ||||
# | ||||
# for text, kind in c_scanner | ||||
# puts text if kind == :operator | ||||
# end | ||||
# | ||||
# # prints: (*==)++; | ||||
# | ||||
# OK, this is a very simple example :) | ||||
# You can also use +map+, +any?+, +find+ and even +sort_by+, | ||||
# if you want. | ||||
class Scanner < StringScanner | ||||
|
r3478 | |||
|
r638 | extend Plugin | ||
plugin_host Scanners | ||||
# Raised if a Scanner fails while scanning.
#
# Derives from StandardError (not Exception) so that an ordinary
# +rescue+ clause can handle a scan failure. Code that rescues
# ScanError or Exception explicitly keeps working unchanged.
ScanError = Class.new(StandardError)
require 'coderay/helpers/word_list' | ||||
# The default options for all scanner classes.
#
# Scanner#initialize reads +self.class::DEFAULT_OPTIONS+, so a subclass
# supplies its own defaults by defining a DEFAULT_OPTIONS constant.
# The hash is frozen because it is shared by every instance; it is only
# ever merged (which builds a new Hash), never modified in place.
DEFAULT_OPTIONS = { :stream => false }.freeze
|
r3478 | |||
# Token kinds listed as "not LOC" (presumably excluded when counting
# lines of code — confirm against the code that reads this constant).
# Frozen because the list is shared and must not be modified in place.
KINDS_NOT_LOC = [:comment, :doctype].freeze
|
r638 | |||
class << self | ||||
# Returns whether the Scanner class can be used in streaming mode.
#
# Scanner#initialize checks instances with <tt>kind_of? Streamable</tt>,
# which is true when the class *includes* Streamable. At class level
# that has to be tested with Module#include?; <tt>is_a?</tt> alone only
# detects classes that were *extended* with Streamable. Both are
# accepted so existing callers keep their result.
def streamable?
  include?(Streamable) || is_a?(Streamable)
end
# Prepares +code+ for scanning and returns the result as a String.
#
# * +code+ is converted with +to_s+.
# * On Rubies whose Strings know their encoding, a string that is not
#   valid UTF-8 is duplicated (the argument is never modified) and
#   re-tagged as Windows-1252. If it is not valid Windows-1252 either,
#   its original encoding is restored and it is transcoded to UTF-8,
#   replacing invalid or undefined characters with '?'.
# * Finally line endings are normalized with String#to_unix.
def normify code
  text = code.to_s

  # Ruby 1.8 strings carry no encoding; only newlines need fixing.
  return text.to_unix unless text.respond_to?(:encoding)
  # Valid UTF-8 is passed through untouched.
  return text.to_unix if text.encoding.name == 'UTF-8' && text.valid_encoding?

  text = text.dup
  declared_encoding = text.encoding
  text.force_encoding 'Windows-1252'
  unless text.valid_encoding?
    text.force_encoding declared_encoding
    # A UTF-8 -> UTF-8 conversion would not touch broken bytes, so take
    # a detour through UTF-16BE to get them replaced.
    if text.encoding.name == 'UTF-8'
      text.encode! 'UTF-16BE', :invalid => :replace, :undef => :replace, :replace => '?'
    end
    text.encode! 'UTF-8', :invalid => :replace, :undef => :replace, :replace => '?'
  end

  text.to_unix
end
# Combined reader and writer for the file extension of this scanner.
#
# With an argument, stores <tt>extension.to_s</tt> and returns it.
# Without one, returns the stored value; the first such call falls
# back to <tt>plugin_id.to_s</tt> and remembers it.
def file_extension extension = nil
  return @file_extension ||= plugin_id.to_s unless extension
  @file_extension = extension.to_s
end
|
r638 | |||
end | ||||
=begin | ||||
## Excluded for speed reasons; protected seems to make methods slow. | ||||
# Save the StringScanner methods from being called. | ||||
# This would not be useful for highlighting. | ||||
strscan_public_methods = | ||||
StringScanner.instance_methods - | ||||
StringScanner.ancestors[1].instance_methods | ||||
protected(*strscan_public_methods) | ||||
=end | ||||
# Create a new Scanner.
#
# * +code+ is the input String. It is normalized with Scanner.normify
#   and then handed to the superclass StringScanner.
# * +options+ is a Hash with Symbols as keys. It is merged over the
#   DEFAULT_OPTIONS of the scanner's class, so it can overwrite them.
#   <tt>options[:tokens]</tt>, if given, is used as the token container
#   instead of a freshly created one.
# * +block+ is the callback for streamed highlighting.
#
# If :stream is set in the options, the Scanner uses a TokenStream with
# the +block+ as callback to handle the tokens; otherwise a Tokens
# object is used. A warning is printed when :stream is set without a
# block, or when a block is given without :stream.
#
# Raises a RuntimeError when Scanner itself (not a subclass) is
# instantiated, and NotStreamableError when :stream is set but the
# scanner is not Streamable.
def initialize code='', options = {}, &block
  if self.class == Scanner
    raise "I am only the basic Scanner class. I can't scan "\
      "anything. :( Use my subclasses."
  end

  @options = self.class::DEFAULT_OPTIONS.merge options

  super Scanner.normify(code)

  # A caller-supplied container wins; the ||= below only fills the gap.
  @tokens = options[:tokens]

  if @options[:stream]
    unless block_given?
      warn "warning in CodeRay::Scanner.new: :stream is set, "\
        "but no block was given"
    end
    raise NotStreamableError, self unless kind_of? Streamable
    @tokens ||= TokenStream.new(&block)
  else
    if block_given?
      warn "warning in CodeRay::Scanner.new: Block given, "\
        "but :stream is #{@options[:stream]}"
    end
    @tokens ||= Tokens.new
  end
  @tokens.scanner = self

  setup
end
# Resets the scanner: first the superclass state (via +super+), then
# the per-scan instance state (see reset_instance).
def reset
  super()
  reset_instance
end
# Replaces the input string. The new code is normalized with
# Scanner.normify first, and the per-scan instance state is cleared
# afterwards (see reset_instance).
def string= code
  normalized = Scanner.normify(code)

  # Special case for Rubinius 1.0.1: bypass the inherited writer and set
  # the state by hand (presumably a workaround for a bug in that
  # release — confirm before removing).
  on_rubinius_1_0_1 = defined?(RUBY_DESCRIPTION) && RUBY_DESCRIPTION['rubinius 1.0.1']
  if on_rubinius_1_0_1
    reset_state
    @string = normalized
  else
    super normalized
  end

  reset_instance
end
# More mnemonic accessor name for the input string. | ||||
alias code string | ||||
alias code= string= | ||||
|
# Returns the Plugin ID of this scanner, as reported by its class.
def lang
  scanner_class = self.class
  scanner_class.plugin_id
end
|
# Scans the code and returns all tokens.
#
# * +new_string+, if given, replaces the input first (see #string=).
# * +options+ is merged over the instance options for this scan only.
#
# The result is cached for #tokens. In non-streaming mode it is whatever
# scan_tokens returns. In streaming mode it is the token container
# itself, and the scanner is reset first unless a new string was given.
# Whether streaming is used depends on the options the scanner was
# created with, not on +options+ passed here.
def tokenize new_string=nil, options = {}
  options = @options.merge(options)
  self.string = new_string if new_string

  unless @options[:stream]  # :stream must have been set already
    return @cached_tokens = scan_tokens(@tokens, options)
  end

  reset unless new_string
  scan_tokens @tokens, options
  @cached_tokens = @tokens
end
# Returns the tokens of the last scan, running #tokenize first if
# nothing has been cached yet.
def tokens
  @cached_tokens ||= tokenize()
end
# Whether the scanner is in streaming mode, i.e. whether its :stream
# option is set. Always returns +true+ or +false+.
def streaming?
  @options[:stream] ? true : false
end
# Traverses the tokens, yielding each one to +block+.
#
# Raises ArgumentError in streaming mode, because a TokenStream cannot
# be traversed.
def each &block
  if @options[:stream]
    raise ArgumentError, 'Cannot traverse TokenStream.'
  end

  tokens.each(&block)
end
include Enumerable | ||||
# The line number (starting at 1) of +pos+, which defaults to the
# current position of the scanner.
#
# Only newlines strictly before +pos+ are counted, so a position that
# sits on a newline character still belongs to the line it ends.
# +pos+ is a byte offset (as StringScanner reports it), so the input is
# sliced by bytes where String#byteslice is available.
#
# Beware, this is implemented inefficiently (it rescans the input from
# the start on every call). It should be used for debugging only.
def line pos = self.pos
  return 1 if pos <= 0

  source = string
  consumed =
    if source.respond_to?(:byteslice)
      source.byteslice(0, pos)
    else
      # Strings without byteslice: plain exclusive slice.
      source[0...pos]
    end
  consumed.count("\n") + 1
end
|
r3478 | |||
# The column (starting at 1) of +pos+, which defaults to the current
# position of the scanner.
#
# The column is the distance from the last newline strictly before
# +pos+, or from the start of the input on the first line. Columns are
# therefore 1-based on every line, and a position that sits on a newline
# character is reported as the column just past the end of its line.
#
# +pos+ is a byte offset. When the input contains multi-byte characters
# a binary copy of it is cached in @bin_string and searched instead, so
# that byte offsets and string indices agree.
def column pos = self.pos
  return 1 if pos <= 0

  source = string
  if source.respond_to?(:bytesize) && (defined?(@bin_string) || source.bytesize != source.size)
    @bin_string ||= source.dup.force_encoding('binary')
    source = @bin_string
  end

  # Search from pos - 1 so that a newline at pos itself is not taken for
  # the start of the current line; -1 stands for "before the input".
  pos - (source.rindex(?\n, pos - 1) || -1)
end
# Marshal hook: only the options hash is serialized.
def marshal_dump
  @options
end
# Marshal hook: restores the options hash written by marshal_dump.
# Nothing else is restored.
def marshal_load options
  @options = options
end
|
r638 | |||
protected | ||||
# Hook for subclasses: called at the end of #initialize, i.e. once per
# instance. Does nothing by default.
#
# Initialization that has to happen once per scan belongs in reset.
def setup; end
# The central method, and commonly the only one a subclass implements.
#
# Subclasses must override it. An implementation must return +tokens+
# and must only use Tokens#<< for storing scanned tokens!
#
# This default always raises NotImplementedError.
def scan_tokens tokens, options
  message = "#{self.class}#scan_tokens not implemented."
  raise NotImplementedError, message
end
# Clears the per-scan state: empties the token container (unless the
# :keep_tokens option is set), drops the cached tokens, and drops the
# binary copy of the input used by #column if one was ever created.
def reset_instance
  keep_tokens = @options[:keep_tokens]
  @tokens.clear unless keep_tokens
  @cached_tokens = nil
  # Only touch @bin_string when it already exists, so that it does not
  # become defined as a side effect.
  @bin_string = nil if defined?(@bin_string)
end
# Raises a ScanError carrying additional status information.
#
# * +msg+ is the error description.
# * +tokens+ are the tokens scanned so far; their count and the last
#   ten of them are reported.
# * +state+ is the scanner state to report.
# * +ambit+ is how much of the input before and after the current
#   position is quoted as "surrounding code".
#
# The quoted text before the position is clamped to the start of the
# input: near the beginning, a negative start index would otherwise
# select text from the end of the input (or nothing at all).
def raise_inspect msg, tokens, state = 'No state given!', ambit = 30
  before_start = pos - ambit
  before_start = 0 if before_start < 0

  raise ScanError, <<-EOE % [
***ERROR in %s: %s (after %d tokens)
tokens:
%s
current line: %d column: %d pos: %d
matched: %p state: %p
bol? = %p, eos? = %p
surrounding code:
%p ~~ %p
***ERROR***
  EOE
    File.basename(caller[0]),
    msg,
    tokens.size,
    tokens.last(10).map { |t| t.inspect }.join("\n"),
    line, column, pos,
    matched, state, bol?, eos?,
    string[before_start, pos - before_start],
    string[pos, ambit],
  ]
end
end | ||||
end | ||||
end | ||||
class String
  # Returns the string with DOS ("\r\n") and old Mac ("\r") line endings
  # converted to Unix ("\n") ones.
  #
  # A string without any carriage return is returned as is (the very
  # same object, not a copy).
  def to_unix
    return self unless index(?\r)
    gsub(/\r\n?/, "\n")
  end
end