Skip to content

Commit e6871fc

Browse files
authored
Merge pull request #251 from Earlopain/use-native-prism
Use native prism
2 parents e7eaea5 + 87ad865 commit e6871fc

14 files changed

Lines changed: 322 additions & 304 deletions

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
## HEAD (unreleased)
22

3-
- Changed: Changed: Minimum supported Ruby version is now 3.3. (https://github.com/ruby/syntax_suggest/pull/246)
3+
- Changed: Minimum supported Ruby version is now 3.3. (https://github.com/ruby/syntax_suggest/pull/246)
4+
- Changed: Use native prism to analyse. (https://github.com/ruby/syntax_suggest/pull/251)
45

56
## 2.0.3
67

lib/syntax_suggest/api.rb

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,6 @@
99

1010
# Prism is the new parser, replacing Ripper
1111
require "prism"
12-
# We need Ripper loaded for `Prism.lex_compat` even if we're using Prism
13-
# for lexing and parsing
14-
require "ripper"
1512

1613
module SyntaxSuggest
1714
# Used to indicate a default value that cannot
@@ -188,7 +185,6 @@ def self.valid?(source)
188185
require_relative "clean_document"
189186

190187
# Helpers
191-
require_relative "lex_all"
192188
require_relative "code_line"
193189
require_relative "code_block"
194190
require_relative "block_expand"
@@ -200,3 +196,5 @@ def self.valid?(source)
200196
require_relative "pathname_from_message"
201197
require_relative "display_invalid_blocks"
202198
require_relative "parse_blocks_from_indent_line"
199+
require_relative "visitor"
200+
require_relative "token"

lib/syntax_suggest/clean_document.rb

Lines changed: 8 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -66,27 +66,9 @@ module SyntaxSuggest
6666
#
6767
# All of these problems are fixed by joining the whole heredoc into a single
6868
# line.
69-
#
70-
# ## Comments and whitespace
71-
#
72-
# Comments can throw off the way the lexer tells us that the line
73-
# logically belongs with the next line. This is valid ruby but
74-
# results in a different lex output than before:
75-
#
76-
# 1 User.
77-
# 2 where(name: "schneems").
78-
# 3 # Comment here
79-
# 4 first
80-
#
81-
# To handle this we can replace comment lines with empty lines
82-
# and then re-lex the source. This removal and re-lexing preserves
83-
# line index and document size, but generates an easier to work with
84-
# document.
85-
#
8669
class CleanDocument
8770
def initialize(source:)
88-
lines = clean_sweep(source: source)
89-
@document = CodeLine.from_source(lines.join)
71+
@document = CodeLine.from_source(source)
9072
end
9173

9274
# Call all of the document "cleaners"
@@ -110,62 +92,6 @@ def to_s
11092
@document.join
11193
end
11294

113-
# Remove comments
114-
#
115-
# replace with empty newlines
116-
#
117-
# source = <<~'EOM'
118-
# # Comment 1
119-
# puts "hello"
120-
# # Comment 2
121-
# puts "world"
122-
# EOM
123-
#
124-
# lines = CleanDocument.new(source: source).lines
125-
# expect(lines[0].to_s).to eq("\n")
126-
# expect(lines[1].to_s).to eq("puts "hello")
127-
# expect(lines[2].to_s).to eq("\n")
128-
# expect(lines[3].to_s).to eq("puts "world")
129-
#
130-
# Important: This must be done before lexing.
131-
#
132-
# After this change is made, we lex the document because
133-
# removing comments can change how the doc is parsed.
134-
#
135-
# For example:
136-
#
137-
# values = LexAll.new(source: <<~EOM))
138-
# User.
139-
# # comment
140-
# where(name: 'schneems')
141-
# EOM
142-
# expect(
143-
# values.count {|v| v.type == :on_ignored_nl}
144-
# ).to eq(1)
145-
#
146-
# After the comment is removed:
147-
#
148-
# values = LexAll.new(source: <<~EOM))
149-
# User.
150-
#
151-
# where(name: 'schneems')
152-
# EOM
153-
# expect(
154-
# values.count {|v| v.type == :on_ignored_nl}
155-
# ).to eq(2)
156-
#
157-
def clean_sweep(source:)
158-
# Match comments, but not HEREDOC strings with #{variable} interpolation
159-
# https://rubular.com/r/HPwtW9OYxKUHXQ
160-
source.lines.map do |line|
161-
if line.match?(/^\s*#([^{].*|)$/)
162-
$/
163-
else
164-
line
165-
end
166-
end
167-
end
168-
16995
# Smushes all heredoc lines into one line
17096
#
17197
# source = <<~'EOM'
@@ -184,9 +110,9 @@ def join_heredoc!
184110
lines.each do |line|
185111
line.tokens.each do |token|
186112
case token.type
187-
when :on_heredoc_beg
113+
when :HEREDOC_START
188114
start_index_stack << line.index
189-
when :on_heredoc_end
115+
when :HEREDOC_END
190116
start_index = start_index_stack.pop
191117
end_index = line.index
192118
heredoc_beg_end_index << [start_index, end_index]
@@ -212,20 +138,10 @@ def join_heredoc!
212138
# expect(lines[0].to_s).to eq(source)
213139
# expect(lines[1].to_s).to eq("")
214140
#
215-
# The one known case this doesn't handle is:
216-
#
217-
# Ripper.lex <<~EOM
218-
# a &&
219-
# b ||
220-
# c
221-
# EOM
222-
#
223-
# For some reason this introduces `on_ignore_newline` but with BEG type
224-
#
225141
def join_consecutive!
226-
consecutive_groups = @document.select(&:ignore_newline_not_beg?).map do |code_line|
142+
consecutive_groups = @document.select(&:consecutive?).map do |code_line|
227143
take_while_including(code_line.index..) do |line|
228-
line.ignore_newline_not_beg?
144+
line.consecutive?
229145
end
230146
end
231147

@@ -275,14 +191,15 @@ def join_groups(groups)
275191
@document[line.index] = CodeLine.new(
276192
tokens: lines.map(&:tokens).flatten,
277193
line: lines.join,
278-
index: line.index
194+
index: line.index,
195+
consecutive: false
279196
)
280197

281198
# Hide the rest of the lines
282199
lines[1..].each do |line|
283200
# The above lines already have newlines in them; if we add more
284201
# then there will be double newline, use an empty line instead
285-
@document[line.index] = CodeLine.new(line: "", index: line.index, tokens: [])
202+
@document[line.index] = CodeLine.new(line: "", index: line.index, tokens: [], consecutive: false)
286203
end
287204
end
288205
self

lib/syntax_suggest/code_line.rb

Lines changed: 48 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -27,21 +27,56 @@ class CodeLine
2727
# Returns an array of CodeLine objects
2828
# from the source string
2929
def self.from_source(source)
30-
tokens_for_line = LexAll.new(source: source).each_with_object(Hash.new { |h, k| h[k] = [] }) { |token, hash| hash[token.line] << token }
30+
source = +source
31+
parse_result = Prism.parse_lex(source)
32+
ast, tokens = parse_result.value
33+
34+
clean_comments!(source, parse_result.comments)
35+
36+
visitor = Visitor.new
37+
visitor.visit(ast)
38+
tokens.sort_by! { |token, _state| token.location.start_line }
39+
40+
prev_token = nil
41+
tokens.map! do |token, _state|
42+
prev_token = Token.new(token, prev_token, visitor)
43+
end
44+
45+
tokens_for_line = tokens.each_with_object(Hash.new { |h, k| h[k] = [] }) { |token, hash| hash[token.line] << token }
3146
source.lines.map.with_index do |line, index|
3247
CodeLine.new(
3348
line: line,
3449
index: index,
35-
tokens: tokens_for_line[index + 1]
50+
tokens: tokens_for_line[index + 1],
51+
consecutive: visitor.consecutive_lines.include?(index + 1)
3652
)
3753
end
3854
end
3955

56+
# Remove comments that appear on their own in source. They will never be the cause
57+
# of syntax errors and are just visual noise. Example:
58+
#
59+
# source = +<<~RUBY
60+
# # Comment-only line
61+
# foo # Inline comment
62+
# RUBY
63+
# CodeLine.clean_comments!(source, Prism.parse(source).comments)
64+
# source # => "\nfoo # Inline comment\n"
65+
def self.clean_comments!(source, comments)
66+
# Iterate backwards since we are modifying the source in place and must preserve
67+
# the offsets. Prism comments are sorted by their location in the source.
68+
comments.reverse_each do |comment|
69+
next if comment.trailing?
70+
source.bytesplice(comment.location.start_offset, comment.location.length, "")
71+
end
72+
end
73+
4074
attr_reader :line, :index, :tokens, :line_number, :indent
41-
def initialize(line:, index:, tokens:)
75+
def initialize(line:, index:, tokens:, consecutive:)
4276
@tokens = tokens
4377
@line = line
4478
@index = index
79+
@consecutive = consecutive
4580
@original = line
4681
@line_number = @index + 1
4782
strip_line = line.dup
@@ -150,91 +185,36 @@ def <=>(other)
150185
index <=> other.index
151186
end
152187

153-
# [Not stable API]
154-
#
155-
# Lines that have a `on_ignored_nl` type token and NOT
156-
# a `BEG` type seem to be a good proxy for the ability
157-
# to join multiple lines into one.
158-
#
159-
# This predicate method is used to determine when those
160-
# two criteria have been met.
161-
#
162-
# The one known case this doesn't handle is:
163-
#
164-
# Ripper.lex <<~EOM
165-
# a &&
166-
# b ||
167-
# c
168-
# EOM
169-
#
170-
# For some reason this introduces `on_ignore_newline` but with BEG type
171-
def ignore_newline_not_beg?
172-
@ignore_newline_not_beg
188+
# Can this line be logically joined together
189+
# with the following line? Determined by walking
190+
# the AST
191+
def consecutive?
192+
@consecutive
173193
end
174194

175-
# Determines if the given line has a trailing slash
195+
# Determines if the given line has a trailing slash.
196+
# Simply check if the line contains a backslash after
197+
# the content of the last token.
176198
#
177199
# lines = CodeLine.from_source(<<~EOM)
178200
# it "foo" \
179201
# EOM
180202
# expect(lines.first.trailing_slash?).to eq(true)
181203
#
182204
def trailing_slash?
183-
last = @tokens.last
184-
185-
# Older versions of prism diverged slightly from Ripper in compatibility mode
186-
case last&.type
187-
when :on_sp
188-
last.value == TRAILING_SLASH
189-
when :on_tstring_end
190-
true
191-
else
192-
false
193-
end
205+
return unless (last = @tokens.last)
206+
@line.byteindex(TRAILING_SLASH, last.location.end_column) != nil
194207
end
195208

196-
# Endless method detection
197-
#
198-
# From https://github.com/ruby/irb/commit/826ae909c9c93a2ddca6f9cfcd9c94dbf53d44ab
199-
# Detecting a "oneliner" seems to need a state machine.
200-
# This can be done by looking mostly at the "state" (last value):
201-
#
202-
# ENDFN -> BEG (token = '=' ) -> END
203-
#
204209
private def set_kw_end
205-
oneliner_count = 0
206-
in_oneliner_def = nil
207-
208210
kw_count = 0
209211
end_count = 0
210212

211-
@ignore_newline_not_beg = false
212213
@tokens.each do |token|
213214
kw_count += 1 if token.is_kw?
214215
end_count += 1 if token.is_end?
215-
216-
if token.type == :on_ignored_nl
217-
@ignore_newline_not_beg = !token.expr_beg?
218-
end
219-
220-
if in_oneliner_def.nil?
221-
in_oneliner_def = :ENDFN if token.state.allbits?(Ripper::EXPR_ENDFN)
222-
elsif token.state.allbits?(Ripper::EXPR_ENDFN)
223-
# Continue
224-
elsif token.state.allbits?(Ripper::EXPR_BEG)
225-
in_oneliner_def = :BODY if token.value == "="
226-
elsif token.state.allbits?(Ripper::EXPR_END)
227-
# We found an endless method, count it
228-
oneliner_count += 1 if in_oneliner_def == :BODY
229-
230-
in_oneliner_def = nil
231-
else
232-
in_oneliner_def = nil
233-
end
234216
end
235217

236-
kw_count -= oneliner_count
237-
238218
@is_kw = (kw_count - end_count) > 0
239219
@is_end = (end_count - kw_count) > 0
240220
end

lib/syntax_suggest/left_right_token_count.rb

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -49,21 +49,22 @@ def count_end
4949
#
5050
# Example:
5151
#
52+
# token = CodeLine.from_source("{").first.tokens.first
5253
# left_right = LeftRightTokenCount.new
53-
# left_right.count_token(Token.new(1, :on_lbrace, "{", Ripper::EXPR_BEG))
54+
# left_right.count_token(Token.new(token))
5455
# left_right.count_for_char("{")
5556
# # => 1
5657
# left_right.count_for_char("}")
5758
# # => 0
5859
def count_token(token)
5960
case token.type
60-
when :on_tstring_content
61+
when :STRING_CONTENT
6162
# ^^^
6263
# Means it's a string or a symbol `"{"` rather than being
6364
# part of a data structure (like a hash) `{ a: b }`
6465
# ignore it.
65-
when :on_words_beg, :on_symbols_beg, :on_qwords_beg,
66-
:on_qsymbols_beg, :on_regexp_beg, :on_tstring_beg
66+
when :PERCENT_UPPER_W, :PERCENT_UPPER_I, :PERCENT_LOWER_W,
67+
:PERCENT_LOWER_I, :REGEXP_BEGIN, :STRING_BEGIN
6768
# ^^^
6869
# Handle shorthand syntaxes like `%Q{ i am a string }`
6970
#
@@ -72,25 +73,18 @@ def count_token(token)
7273
# can be used
7374
char = token.value[-1]
7475
@count_for_char[char] += 1 if @count_for_char.key?(char)
75-
when :on_embexpr_beg
76+
when :EMBEXPR_BEGIN
7677
# ^^^
7778
# Embedded string expressions like `"#{foo} <-embed"`
7879
# are parsed with chars:
7980
#
80-
# `#{` as :on_embexpr_beg
81-
# `}` as :on_embexpr_end
82-
#
83-
# We cannot ignore both :on_emb_expr_beg and :on_embexpr_end
84-
# because sometimes the lexer thinks something is an embed
85-
# string end, when it is not like `lol = }` (no clue why).
81+
# `#{` as :EMBEXPR_BEGIN
82+
# `}` as :EMBEXPR_END
8683
#
8784
# When we see `#{` count it as a `{` or we will
8885
# have a mis-match count.
8986
#
90-
case token.value
91-
when "\#{"
92-
@count_for_char["{"] += 1
93-
end
87+
@count_for_char["{"] += 1
9488
else
9589
@end_count += 1 if token.is_end?
9690
@kw_count += 1 if token.is_kw?

0 commit comments

Comments
 (0)