ruby.rb
444 lines
| 15.0 KiB
| text/x-ruby
|
RubyLexer
|
# encoding: utf-8
|
module CodeRay
  module Scanners
    # This scanner is really complex, since Ruby _is_ a complex language!
    #
    # It tries to highlight 100% of all common code,
    # and 90% of strange code.
    #
    # It is optimized for HTML highlighting, and is not very useful for
    # parsing or pretty printing.
    #
    # For now, I think it's better than the scanners in VIM or Syntax, or
    # any highlighter I was able to find, except Caleb's RubyLexer.
    #
    # I hope it's also better than the rdoc/irb lexer.
    class Ruby < Scanner
      include Streamable

      register_for :ruby
      file_extension 'rb'

      helper :patterns

      # Define a stand-in EncodingError when the running Ruby does not
      # provide one, so the constant can always be referenced.
      if not defined? EncodingError
        EncodingError = Class.new Exception
      end

      private

      # Scans the scanner's input string and appends tokens to +tokens+.
      #
      # Tokens are two-element arrays: either <tt>[text, kind]</tt>, or
      # <tt>[:open, kind]</tt> / <tt>[:close, kind]</tt> markers that bracket
      # a nested region (string, regexp, symbol, shell, heredoc, inline code).
      #
      # tokens::  the collection tokens are appended to (via <tt><<</tt>).
      # options:: accepted for interface compatibility; not read here.
      #
      # Returns +tokens+.
      #
      # Local state used by the main loop:
      #
      # state::              a Symbol (+:initial+, +:def_expected+,
      #                      +:module_expected+, +:undef_expected+,
      #                      +:undef_comma_expected+, +:alias_expected+) or a
      #                      <tt>Patterns::StringState</tt> while inside a
      #                      string-like literal.
      # last_state::         string state to return to after one token has
      #                      been scanned as normal code (<tt>#$x</tt>,
      #                      <tt>#@x</tt> inside strings).
      # value_expected::     truthy when the next token may start a value
      #                      (this disambiguates <tt>/</tt>, <tt>%</tt>,
      #                      <tt>?x</tt>, <tt><<</tt> and signed numbers).
      #                      +:set+ means "set by the current token";
      #                      +:expect_colon+ is used after <tt>?</tt> and
      #                      +when+ and suppresses hash-key detection.
      # last_token_dot::     truthy when the previous token was <tt>.</tt> or
      #                      <tt>::</tt>; +:set+ means "set by the current
      #                      token".
      # heredocs::           heredoc states opened on the current line that
      #                      start at the next newline, or +nil+.
      # depth::              brace depth inside the current <tt>#{...}</tt>.
      # inline_block_stack:: saved <tt>[state, depth, heredocs]</tt> of the
      #                      enclosing strings of nested <tt>#{...}</tt>.
      def scan_tokens tokens, options
        if string.respond_to?(:encoding)
          # Encoding-aware strings: normalize the input to UTF-8, replacing
          # anything that cannot be converted with '?'. The /u regexp
          # variants are not used in this case.
          unless string.encoding == Encoding::UTF_8
            self.string = string.encode Encoding::UTF_8,
              :invalid => :replace, :undef => :replace, :replace => '?'
          end
          unicode = false
        else
          # No encoding support: use the /u variants as soon as the input
          # contains any non-ASCII byte.
          unicode = exist?(/[^\x00-\x7f]/)
        end

        last_token_dot = false
        value_expected = true
        heredocs = nil
        last_state = nil
        state = :initial
        depth = nil
        inline_block_stack = []

        patterns = Patterns  # avoid constant lookup

        until eos?
          match = nil
          kind = nil

          if state.instance_of? patterns::StringState
            # {{{
            # Inside a string-like literal: everything up to the next
            # interesting character is plain content.
            match = scan_until(state.pattern) || scan_until(/\z/)
            tokens << [match, :content] unless match.empty?
            break if eos?

            if state.heredoc and self[1]  # end of heredoc
              match = getch.to_s
              match << scan_until(/$/) unless eos?
              tokens << [match, :delimiter]
              tokens << [:close, state.type]
              state = state.next_state
              next
            end

            case match = getch

            when state.delim
              if state.paren
                # Paired delimiters nest; only the outermost one closes.
                state.paren_depth -= 1
                if state.paren_depth > 0
                  tokens << [match, :nesting_delimiter]
                  next
                end
              end
              tokens << [match, :delimiter]
              if state.type == :regexp and not eos?
                modifiers = scan(/#{patterns::REGEXP_MODIFIERS}/ox)
                tokens << [modifiers, :modifier] unless modifiers.empty?
              end
              tokens << [:close, state.type]
              value_expected = false
              state = state.next_state

            when '\\'
              if state.interpreted
                if esc = scan(/ #{patterns::ESCAPE} /ox)
                  tokens << [match + esc, :char]
                else
                  tokens << [match, :error]
                end
              else
                # Non-interpreted literal: only the delimiter and the
                # backslash itself are treated as escapes.
                case m = getch
                when state.delim, '\\'
                  tokens << [match + m, :char]
                when nil
                  tokens << [match, :error]
                else
                  tokens << [match + m, :content]
                end
              end

            when '#'
              case peek(1)
              when '{'
                # Start of #{...}: remember where we are and scan the
                # embedded code as normal Ruby.
                inline_block_stack << [state, depth, heredocs]
                value_expected = true
                state = :initial
                depth = 1
                tokens << [:open, :inline]
                tokens << [match + getch, :inline_delimiter]
              when '$', '@'
                tokens << [match, :escape]
                last_state = state  # scan one token as normal code, then return here
                state = :initial
              else
                raise_inspect 'else-case # reached; #%p not handled' % peek(1), tokens
              end

            when state.paren
              state.paren_depth += 1
              tokens << [match, :nesting_delimiter]

            when /#{patterns::REGEXP_SYMBOLS}/ox
              tokens << [match, :function]

            else
              raise_inspect 'else-case " reached; %p not handled, state = %p' % [match, state], tokens

            end
            next
            # }}}
          else
            # {{{
            if match = scan(/[ \t\f]+/)
              kind = :space
              # Swallow following whitespace too, unless a heredoc is
              # pending (its body starts at the next newline).
              match << scan(/\s*/) unless eos? || heredocs
              value_expected = true if match.index(?\n)
              tokens << [match, kind]
              next

            elsif match = scan(/\\?\n/)
              kind = :space
              if match == "\n"
                value_expected = true
                state = :initial if state == :undef_comma_expected
              end
              if heredocs
                unscan  # heredoc scanning needs \n at start
                state = heredocs.shift
                tokens << [:open, state.type]
                heredocs = nil if heredocs.empty?
                next
              else
                match << scan(/\s*/) unless eos?
              end
              tokens << [match, kind]
              next

            elsif bol? && match = scan(/\#!.*/)
              tokens << [match, :doctype]
              next

            elsif match = scan(/\#.*/) or
                  ( bol? and match = scan(/#{patterns::RUBYDOC_OR_DATA}/o) )
              kind = :comment
              tokens << [match, kind]
              next

            elsif state == :initial

              # IDENTS #
              if match = scan(unicode ? /#{patterns::METHOD_NAME}/uo :
                                        /#{patterns::METHOD_NAME}/o)
                if last_token_dot
                  # After "." or "::": a capitalized name is a constant
                  # unless it is directly followed by "(".
                  kind = if match[/^[A-Z]/] and not match?(/\(/) then :constant else :ident end
                else
                  if value_expected != :expect_colon && scan(/:(?= )/)
                    # "name: " is a hash key followed by the ":" operator.
                    tokens << [match, :key]
                    match = ':'
                    kind = :operator
                  else
                    kind = patterns::IDENT_KIND[match]
                    if kind == :ident
                      if match[/\A[A-Z]/] and not match[/[!?]$/] and not match?(/\(/)
                        kind = :constant
                      end
                    elsif kind == :reserved
                      state = patterns::DEF_NEW_STATE[match]
                      value_expected = :set if patterns::KEYWORDS_EXPECTING_VALUE[match]
                    end
                  end
                end
                value_expected = :set if check(/#{patterns::VALUE_FOLLOWS}/o)

              elsif last_token_dot and match = scan(/#{patterns::METHOD_NAME_OPERATOR}|\(/o)
                kind = :ident
                value_expected = :set if check(unicode ? /#{patterns::VALUE_FOLLOWS}/uo :
                                                         /#{patterns::VALUE_FOLLOWS}/o)

              # OPERATORS #
              elsif not last_token_dot and match = scan(/ \.\.\.? | (?:\.|::)() | [,\(\)\[\]\{\}] | ==?=? /x)
                if match !~ / [.\)\]\}] /x or match =~ /\.\.\.?/
                  value_expected = :set
                end
                # The empty capture group only participates for "." and "::".
                last_token_dot = :set if self[1]
                kind = :operator
                unless inline_block_stack.empty?
                  case match
                  when '{'
                    depth += 1
                  when '}'
                    depth -= 1
                    if depth == 0  # closing brace of inline block reached
                      state, depth, heredocs = inline_block_stack.pop
                      heredocs = nil if heredocs && heredocs.empty?
                      tokens << [match, :inline_delimiter]
                      kind = :inline
                      match = :close
                    end
                  end
                end

              elsif match = scan(/ ['"] /mx)
                tokens << [:open, :string]
                kind = :delimiter
                state = patterns::StringState.new :string, match == '"', match  # important for streaming

              elsif match = scan(unicode ? /#{patterns::INSTANCE_VARIABLE}/uo :
                                           /#{patterns::INSTANCE_VARIABLE}/o)
                kind = :instance_variable

              elsif value_expected and match = scan(/\//)
                tokens << [:open, :regexp]
                kind = :delimiter
                interpreted = true
                state = patterns::StringState.new :regexp, interpreted, match

              # elsif match = scan(/[-+]?#{patterns::NUMERIC}/o)
              elsif match = value_expected ? scan(/[-+]?#{patterns::NUMERIC}/o) : scan(/#{patterns::NUMERIC}/o)
                # A sign belongs to the number only where a value is expected.
                kind = self[1] ? :float : :integer

              elsif match = scan(unicode ? /#{patterns::SYMBOL}/uo :
                                           /#{patterns::SYMBOL}/o)
                case delim = match[1]
                when ?', ?"
                  # Quoted symbol (:"..." / :'...'): scan the rest as a string.
                  tokens << [:open, :symbol]
                  tokens << [':', :symbol]
                  match = delim.chr
                  kind = :delimiter
                  state = patterns::StringState.new :symbol, delim == ?", match
                else
                  kind = :symbol
                end

              elsif match = scan(/ -[>=]? | [+!~^]=? | [*|&]{1,2}=? | >>? /x)
                value_expected = :set
                kind = :operator

              elsif value_expected and match = scan(unicode ? /#{patterns::HEREDOC_OPEN}/uo :
                                                              /#{patterns::HEREDOC_OPEN}/o)
                indented = self[1] == '-'
                quote = self[3]
                delim = self[quote ? 4 : 2]
                kind = patterns::QUOTE_TO_TYPE[quote]
                tokens << [:open, kind]
                tokens << [match, :delimiter]
                match = :close
                heredoc = patterns::StringState.new kind, quote != '\'', delim, (indented ? :indented : :linestart )
                # The heredoc body is scanned once the next newline is reached.
                heredocs ||= []  # create heredocs if empty
                heredocs << heredoc

              elsif value_expected and match = scan(/#{patterns::FANCY_START_CORRECT}/o)
                # The fallback block receives the looked-up key, so the
                # diagnostic can name the unknown %-literal type.
                kind, interpreted = *patterns::FancyStringType.fetch(self[1]) do |fancy_type|
                  raise_inspect 'Unknown fancy string: %%%p' % fancy_type, tokens
                end
                tokens << [:open, kind]
                state = patterns::StringState.new kind, interpreted, self[2]
                kind = :delimiter

              elsif value_expected and match = scan(unicode ? /#{patterns::CHARACTER}/uo :
                                                              /#{patterns::CHARACTER}/o)
                kind = :integer

              elsif match = scan(/ [\/%]=? | <(?:<|=>?)? | [?:;] /x)
                value_expected = :set
                kind = :operator

              elsif match = scan(/`/)
                if last_token_dot
                  kind = :operator
                else
                  tokens << [:open, :shell]
                  kind = :delimiter
                  state = patterns::StringState.new :shell, true, match
                end

              elsif match = scan(unicode ? /#{patterns::GLOBAL_VARIABLE}/uo :
                                           /#{patterns::GLOBAL_VARIABLE}/o)
                kind = :global_variable

              elsif match = scan(unicode ? /#{patterns::CLASS_VARIABLE}/uo :
                                           /#{patterns::CLASS_VARIABLE}/o)
                kind = :class_variable

              else
                if !unicode && !string.respond_to?(:encoding)
                  # check for unicode
                  # $DEBUG is switched off around the probe and restored
                  # afterwards -- presumably to keep the rescued exception
                  # from being reported in debug mode.
                  debug, $DEBUG = $DEBUG, false
                  begin
                    if check(/./mu).size > 1
                      # seems like we should try again with unicode
                      unicode = true
                    end
                  rescue
                    # bad unicode char; fall through and emit an error token
                  ensure
                    $DEBUG = debug
                  end
                  next if unicode
                end
                kind = :error
                match = scan(unicode ? /./mu : /./m)

              end

            elsif state == :def_expected
              # After "def": optional "self." followed by the method name.
              state = :initial
              if scan(/self\./)
                tokens << ['self', :pre_constant]
                tokens << ['.', :operator]
              end
              if match = scan(unicode ? /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/uo :
                                        /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/o)
                kind = :method
              else
                next
              end

            elsif state == :module_expected
              # Either "<<" (stays in this state) or a possibly
              # namespaced class/module name.
              if match = scan(/<</)
                kind = :operator
              else
                state = :initial
                if match = scan(unicode ? /(?:#{patterns::IDENT}::)*#{patterns::IDENT}/uo :
                                          /(?:#{patterns::IDENT}::)*#{patterns::IDENT}/o)
                  kind = :class
                else
                  next
                end
              end

            elsif state == :undef_expected
              # One method name or symbol, then optionally a comma.
              state = :undef_comma_expected
              if match = scan(unicode ? /#{patterns::METHOD_NAME_EX}/uo :
                                        /#{patterns::METHOD_NAME_EX}/o)
                kind = :method
              elsif match = scan(unicode ? /#{patterns::SYMBOL}/uo :
                                           /#{patterns::SYMBOL}/o)
                case delim = match[1]
                when ?', ?"
                  tokens << [:open, :symbol]
                  tokens << [':', :symbol]
                  match = delim.chr
                  kind = :delimiter
                  state = patterns::StringState.new :symbol, delim == ?", match
                  state.next_state = :undef_comma_expected
                else
                  kind = :symbol
                end
              else
                state = :initial
                next
              end

            elsif state == :alias_expected
              # "new_name old_name": both names are emitted here directly.
              match = scan(unicode ? /(#{patterns::METHOD_NAME_OR_SYMBOL})([ \t]+)(#{patterns::METHOD_NAME_OR_SYMBOL})/uo :
                                     /(#{patterns::METHOD_NAME_OR_SYMBOL})([ \t]+)(#{patterns::METHOD_NAME_OR_SYMBOL})/o)

              if match
                tokens << [self[1], (self[1][0] == ?: ? :symbol : :method)]
                tokens << [self[2], :space]
                tokens << [self[3], (self[3][0] == ?: ? :symbol : :method)]
              end

              state = :initial
              next

            elsif state == :undef_comma_expected
              if match = scan(/,/)
                kind = :operator
                state = :undef_expected
              else
                state = :initial
                next
              end

            end
            # }}}

            unless kind == :error
              # Flags marked :set by this token stay on for the next token;
              # anything else is reset to false.
              if value_expected = value_expected == :set
                value_expected = :expect_colon if match == '?' || match == 'when'
              end
              last_token_dot = last_token_dot == :set
            end

            if $CODERAY_DEBUG and not kind
              raise_inspect 'Error token %p in line %d' %
                [[match, kind], line], tokens, state
            end
            raise_inspect 'Empty token', tokens unless match

            tokens << [match, kind]

            if last_state
              state = last_state
              last_state = nil
            end
          end
        end

        # Input ended inside a string and/or inline blocks: close whatever
        # is still open, innermost first.
        inline_block_stack << [state] if state.is_a? patterns::StringState
        until inline_block_stack.empty?
          this_block = inline_block_stack.pop
          tokens << [:close, :inline] if this_block.size > 1
          state = this_block.first
          tokens << [:close, state.type]
        end

        tokens
      end
    end
  end
end
# vim:fdm=marker | ||||