Skip to content

Commit 4fa6034

Browse files
Earlopain authored and schneems committed
Fully migrate to prism
It mostly continues to rely on tokens. But for a few things, like endless method definitions and multiline method continuations, it uses the AST. These are either very difficult or impossible to find by checking tokens alone. Because of multiline method calls, comments no longer need to be trimmed.
1 parent e7eaea5 commit 4fa6034

11 files changed

Lines changed: 119 additions & 309 deletions

File tree

lib/syntax_suggest/api.rb

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,6 @@
99

1010
# Prism is the new parser, replacing Ripper
1111
require "prism"
12-
# We need Ripper loaded for `Prism.lex_compat` even if we're using Prism
13-
# for lexing and parsing
14-
require "ripper"
1512

1613
module SyntaxSuggest
1714
# Used to indicate a default value that cannot
@@ -188,7 +185,6 @@ def self.valid?(source)
188185
require_relative "clean_document"
189186

190187
# Helpers
191-
require_relative "lex_all"
192188
require_relative "code_line"
193189
require_relative "code_block"
194190
require_relative "block_expand"
@@ -200,3 +196,5 @@ def self.valid?(source)
200196
require_relative "pathname_from_message"
201197
require_relative "display_invalid_blocks"
202198
require_relative "parse_blocks_from_indent_line"
199+
require_relative "visitor"
200+
require_relative "token"

lib/syntax_suggest/clean_document.rb

Lines changed: 8 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -67,26 +67,9 @@ module SyntaxSuggest
6767
# All of these problems are fixed by joining the whole heredoc into a single
6868
# line.
6969
#
70-
# ## Comments and whitespace
71-
#
72-
# Comments can throw off the way the lexer tells us that the line
73-
# logically belongs with the next line. This is valid ruby but
74-
# results in a different lex output than before:
75-
#
76-
# 1 User.
77-
# 2 where(name: "schneems").
78-
# 3 # Comment here
79-
# 4 first
80-
#
81-
# To handle this we can replace comment lines with empty lines
82-
# and then re-lex the source. This removal and re-lexing preserves
83-
# line index and document size, but generates an easier to work with
84-
# document.
85-
#
8670
class CleanDocument
8771
def initialize(source:)
88-
lines = clean_sweep(source: source)
89-
@document = CodeLine.from_source(lines.join)
72+
@document = CodeLine.from_source(source)
9073
end
9174

9275
# Call all of the document "cleaners"
@@ -110,62 +93,6 @@ def to_s
11093
@document.join
11194
end
11295

113-
# Remove comments
114-
#
115-
# replace with empty newlines
116-
#
117-
# source = <<~'EOM'
118-
# # Comment 1
119-
# puts "hello"
120-
# # Comment 2
121-
# puts "world"
122-
# EOM
123-
#
124-
# lines = CleanDocument.new(source: source).lines
125-
# expect(lines[0].to_s).to eq("\n")
126-
#     expect(lines[1].to_s).to eq('puts "hello"')
127-
# expect(lines[2].to_s).to eq("\n")
128-
#     expect(lines[3].to_s).to eq('puts "world"')
129-
#
130-
# Important: This must be done before lexing.
131-
#
132-
# After this change is made, we lex the document because
133-
# removing comments can change how the doc is parsed.
134-
#
135-
# For example:
136-
#
137-
# values = LexAll.new(source: <<~EOM))
138-
# User.
139-
# # comment
140-
# where(name: 'schneems')
141-
# EOM
142-
# expect(
143-
# values.count {|v| v.type == :on_ignored_nl}
144-
# ).to eq(1)
145-
#
146-
# After the comment is removed:
147-
#
148-
# values = LexAll.new(source: <<~EOM))
149-
# User.
150-
#
151-
# where(name: 'schneems')
152-
# EOM
153-
# expect(
154-
# values.count {|v| v.type == :on_ignored_nl}
155-
# ).to eq(2)
156-
#
157-
def clean_sweep(source:)
158-
# Match comments, but not HEREDOC strings with #{variable} interpolation
159-
# https://rubular.com/r/HPwtW9OYxKUHXQ
160-
source.lines.map do |line|
161-
if line.match?(/^\s*#([^{].*|)$/)
162-
$/
163-
else
164-
line
165-
end
166-
end
167-
end
168-
16996
# Smushes all heredoc lines into one line
17097
#
17198
# source = <<~'EOM'
@@ -184,9 +111,9 @@ def join_heredoc!
184111
lines.each do |line|
185112
line.tokens.each do |token|
186113
case token.type
187-
when :on_heredoc_beg
114+
when :HEREDOC_START
188115
start_index_stack << line.index
189-
when :on_heredoc_end
116+
when :HEREDOC_END
190117
start_index = start_index_stack.pop
191118
end_index = line.index
192119
heredoc_beg_end_index << [start_index, end_index]
@@ -212,20 +139,10 @@ def join_heredoc!
212139
# expect(lines[0].to_s).to eq(source)
213140
# expect(lines[1].to_s).to eq("")
214141
#
215-
# The one known case this doesn't handle is:
216-
#
217-
# Ripper.lex <<~EOM
218-
# a &&
219-
# b ||
220-
# c
221-
# EOM
222-
#
223-
# For some reason this introduces `on_ignore_newline` but with BEG type
224-
#
225142
def join_consecutive!
226-
consecutive_groups = @document.select(&:ignore_newline_not_beg?).map do |code_line|
143+
consecutive_groups = @document.select(&:consecutive?).map do |code_line|
227144
take_while_including(code_line.index..) do |line|
228-
line.ignore_newline_not_beg?
145+
line.consecutive?
229146
end
230147
end
231148

@@ -275,14 +192,15 @@ def join_groups(groups)
275192
@document[line.index] = CodeLine.new(
276193
tokens: lines.map(&:tokens).flatten,
277194
line: lines.join,
278-
index: line.index
195+
index: line.index,
196+
consecutive: false
279197
)
280198

281199
# Hide the rest of the lines
282200
lines[1..].each do |line|
283201
# The above lines already have newlines in them, if add more
284202
# then there will be double newline, use an empty line instead
285-
@document[line.index] = CodeLine.new(line: "", index: line.index, tokens: [])
203+
@document[line.index] = CodeLine.new(line: "", index: line.index, tokens: [], consecutive: false)
286204
end
287205
end
288206
self

lib/syntax_suggest/code_line.rb

Lines changed: 25 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -27,21 +27,33 @@ class CodeLine
2727
# Returns an array of CodeLine objects
2828
# from the source string
2929
def self.from_source(source)
30-
tokens_for_line = LexAll.new(source: source).each_with_object(Hash.new { |h, k| h[k] = [] }) { |token, hash| hash[token.line] << token }
30+
ast, tokens = Prism.parse_lex(source).value
31+
visitor = Visitor.new
32+
visitor.visit(ast)
33+
tokens.sort_by! { |token, _state| token.location.start_line }
34+
35+
prev_token = nil
36+
tokens.map! do |token, _state|
37+
prev_token = Token.new(token, prev_token, visitor)
38+
end
39+
40+
tokens_for_line = tokens.each_with_object(Hash.new { |h, k| h[k] = [] }) { |token, hash| hash[token.line] << token }
3141
source.lines.map.with_index do |line, index|
3242
CodeLine.new(
3343
line: line,
3444
index: index,
35-
tokens: tokens_for_line[index + 1]
45+
tokens: tokens_for_line[index + 1],
46+
consecutive: visitor.consecutive_lines.include?(index + 1)
3647
)
3748
end
3849
end
3950

4051
attr_reader :line, :index, :tokens, :line_number, :indent
41-
def initialize(line:, index:, tokens:)
52+
def initialize(line:, index:, tokens:, consecutive:)
4253
@tokens = tokens
4354
@line = line
4455
@index = index
56+
@consecutive = consecutive
4557
@original = line
4658
@line_number = @index + 1
4759
strip_line = line.dup
@@ -150,91 +162,36 @@ def <=>(other)
150162
index <=> other.index
151163
end
152164

153-
# [Not stable API]
154-
#
155-
# Lines that have a `on_ignored_nl` type token and NOT
156-
# a `BEG` type seem to be a good proxy for the ability
157-
# to join multiple lines into one.
158-
#
159-
# This predicate method is used to determine when those
160-
# two criteria have been met.
161-
#
162-
# The one known case this doesn't handle is:
163-
#
164-
# Ripper.lex <<~EOM
165-
# a &&
166-
# b ||
167-
# c
168-
# EOM
169-
#
170-
# For some reason this introduces `on_ignore_newline` but with BEG type
171-
def ignore_newline_not_beg?
172-
@ignore_newline_not_beg
165+
# Can this line be logically joined together
166+
# with the following line? Determined by walking
167+
# the AST
168+
def consecutive?
169+
@consecutive
173170
end
174171

175-
# Determines if the given line has a trailing slash
172+
# Determines if the given line has a trailing slash.
173+
# Simply check if the line contains a backslash after
174+
# the content of the last token.
176175
#
177176
# lines = CodeLine.from_source(<<~EOM)
178177
# it "foo" \
179178
# EOM
180179
# expect(lines.first.trailing_slash?).to eq(true)
181180
#
182181
def trailing_slash?
183-
last = @tokens.last
184-
185-
# Older versions of prism diverged slightly from Ripper in compatibility mode
186-
case last&.type
187-
when :on_sp
188-
last.value == TRAILING_SLASH
189-
when :on_tstring_end
190-
true
191-
else
192-
false
193-
end
182+
return unless (last = @tokens.last)
183+
@line.byteindex(TRAILING_SLASH, last.location.end_column) != nil
194184
end
195185

196-
# Endless method detection
197-
#
198-
# From https://github.com/ruby/irb/commit/826ae909c9c93a2ddca6f9cfcd9c94dbf53d44ab
199-
# Detecting a "oneliner" seems to need a state machine.
200-
# This can be done by looking mostly at the "state" (last value):
201-
#
202-
# ENDFN -> BEG (token = '=' ) -> END
203-
#
204186
private def set_kw_end
205-
oneliner_count = 0
206-
in_oneliner_def = nil
207-
208187
kw_count = 0
209188
end_count = 0
210189

211-
@ignore_newline_not_beg = false
212190
@tokens.each do |token|
213191
kw_count += 1 if token.is_kw?
214192
end_count += 1 if token.is_end?
215-
216-
if token.type == :on_ignored_nl
217-
@ignore_newline_not_beg = !token.expr_beg?
218-
end
219-
220-
if in_oneliner_def.nil?
221-
in_oneliner_def = :ENDFN if token.state.allbits?(Ripper::EXPR_ENDFN)
222-
elsif token.state.allbits?(Ripper::EXPR_ENDFN)
223-
# Continue
224-
elsif token.state.allbits?(Ripper::EXPR_BEG)
225-
in_oneliner_def = :BODY if token.value == "="
226-
elsif token.state.allbits?(Ripper::EXPR_END)
227-
# We found an endless method, count it
228-
oneliner_count += 1 if in_oneliner_def == :BODY
229-
230-
in_oneliner_def = nil
231-
else
232-
in_oneliner_def = nil
233-
end
234193
end
235194

236-
kw_count -= oneliner_count
237-
238195
@is_kw = (kw_count - end_count) > 0
239196
@is_end = (end_count - kw_count) > 0
240197
end

lib/syntax_suggest/left_right_token_count.rb

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -49,21 +49,22 @@ def count_end
4949
#
5050
# Example:
5151
#
52+
# token = CodeLine.from_source("{").first.tokens.first
5253
# left_right = LeftRightTokenCount.new
53-
# left_right.count_token(Token.new(1, :on_lbrace, "{", Ripper::EXPR_BEG))
54+
#     left_right.count_token(token)
5455
# left_right.count_for_char("{")
5556
# # => 1
5657
# left_right.count_for_char("}")
5758
# # => 0
5859
def count_token(token)
5960
case token.type
60-
when :on_tstring_content
61+
when :STRING_CONTENT
6162
# ^^^
6263
# Means it's a string or a symbol `"{"` rather than being
6364
# part of a data structure (like a hash) `{ a: b }`
6465
# ignore it.
65-
when :on_words_beg, :on_symbols_beg, :on_qwords_beg,
66-
:on_qsymbols_beg, :on_regexp_beg, :on_tstring_beg
66+
when :PERCENT_UPPER_W, :PERCENT_UPPER_I, :PERCENT_LOWER_W,
67+
:PERCENT_LOWER_I, :REGEXP_BEGIN, :STRING_BEGIN
6768
# ^^^
6869
# Handle shorthand syntaxes like `%Q{ i am a string }`
6970
#
@@ -72,25 +73,18 @@ def count_token(token)
7273
# can be used
7374
char = token.value[-1]
7475
@count_for_char[char] += 1 if @count_for_char.key?(char)
75-
when :on_embexpr_beg
76+
when :EMBEXPR_BEGIN
7677
# ^^^
7778
# Embedded string expressions like `"#{foo} <-embed"`
7879
# are parsed with chars:
7980
#
80-
# `#{` as :on_embexpr_beg
81-
# `}` as :on_embexpr_end
82-
#
83-
# We cannot ignore both :on_emb_expr_beg and :on_embexpr_end
84-
# because sometimes the lexer thinks something is an embed
85-
# string end, when it is not like `lol = }` (no clue why).
81+
# `#{` as :EMBEXPR_BEGIN
82+
# `}` as :EMBEXPR_END
8683
#
8784
# When we see `#{` count it as a `{` or we will
8885
# have a mis-match count.
8986
#
90-
case token.value
91-
when "\#{"
92-
@count_for_char["{"] += 1
93-
end
87+
@count_for_char["{"] += 1
9488
else
9589
@end_count += 1 if token.is_end?
9690
@kw_count += 1 if token.is_kw?

0 commit comments

Comments
 (0)