Skip to content

Commit 479005d

Browse files
author
Sebastian Kaupper
committed
Add support for a single quote in a character literal and a double quote in a string literal
1 parent d762084 commit 479005d

2 files changed

Lines changed: 61 additions & 18 deletions

File tree

pyVHDLParser/Token/Parser.py

Lines changed: 47 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,14 @@ class TokenKind(IntEnum):
6666
PossibleRealLiteral = 7 #: Last char was a ``.``
6767
PossibleCharacterLiteral = 8 #: Last char was a ``'``
6868
PossibleStringLiteralStart = 9 #: Last char was a ``"``
69-
PossibleExtendedIdentifierStart = 10 #: Last char was a ``\``
70-
SingleLineComment = 11 #: Found ``--`` before
71-
MultiLineComment = 12 #: Found ``/*`` before
72-
Linebreak = 13 #: Last char was a ``\n``
73-
Directive = 14 #: Last char was a `` ` ``
74-
FuseableCharacter = 15 #: Last char was a character that could be fused
75-
OtherChars = 16 #: Anything else
69+
PossibleStringLiteralEnd = 10 #: Last char was a ``"`` while being in state ``PossibleStringLiteralStart``
70+
PossibleExtendedIdentifierStart = 11 #: Last char was a ``\``
71+
SingleLineComment = 12 #: Found ``--`` before
72+
MultiLineComment = 13 #: Found ``/*`` before
73+
Linebreak = 14 #: Last char was a ``\n``
74+
Directive = 15 #: Last char was a `` ` ``
75+
FuseableCharacter = 16 #: Last char was a character that could be fused
76+
OtherChars = 17 #: Anything else
7677

7778
@classmethod
7879
def GetVHDLTokenizer(cls, iterable: Iterator[str]):
@@ -336,24 +337,23 @@ def GetVHDLTokenizer(cls, iterable: Iterator[str]):
336337
buffer += char
337338
if len(buffer) == 2:
338339
if buffer[1] == "(" and isinstance(previousToken, WordToken):
340+
# An input of the form `<word>'(` always must be a qualified expression
341+
# in order to be valid VHDL. There is no case where `'('` would be a valid character literal
342+
# if preceded by a word token
339343
previousToken = CharacterToken(previousToken, "'", start)
340344
yield previousToken
341345
previousToken = CharacterToken(previousToken, "(", SourceCodePosition(row, column, absolute))
342346
yield previousToken
343347
tokenKind = cls.TokenKind.OtherChars
344-
elif buffer[1] == "'":
345-
previousToken = CharacterToken(previousToken, "'", start)
346-
yield previousToken
347-
previousToken = CharacterToken(previousToken, "'", SourceCodePosition(row, column, absolute))
348-
yield previousToken
349-
tokenKind = cls.TokenKind.OtherChars
350348
else:
351349
continue
352350
elif (len(buffer) == 3) and (buffer[2] == "'"):
351+
# Whatever is enclosed in single quotes is the content of a character literal
353352
previousToken = CharacterLiteralToken(previousToken, buffer, start, SourceCodePosition(row, column, absolute))
354353
yield previousToken
355354
tokenKind = cls.TokenKind.OtherChars
356355
else:
356+
# If the third entry of the buffer is not a closing single quote, the single quote must belong to an attribute
357357
previousToken = CharacterToken(previousToken, "'", start)
358358
yield previousToken
359359

@@ -371,9 +371,41 @@ def GetVHDLTokenizer(cls, iterable: Iterator[str]):
371371
elif tokenKind is cls.TokenKind.PossibleStringLiteralStart:
372372
buffer += char
373373
if char == "\"":
374-
previousToken = StringLiteralToken(previousToken, buffer, start, SourceCodePosition(row, column, absolute))
374+
tokenKind = cls.TokenKind.PossibleStringLiteralEnd
375+
376+
# State: PossibleStringLiteralEnd
377+
elif tokenKind is cls.TokenKind.PossibleStringLiteralEnd:
378+
if char == "\"":
379+
buffer += char
380+
tokenKind = cls.TokenKind.PossibleStringLiteralStart
381+
else:
382+
previousToken = StringLiteralToken(previousToken, buffer, start, SourceCodePosition(row, column-1, absolute-1))
375383
yield previousToken
376-
tokenKind = cls.TokenKind.OtherChars
384+
385+
start = SourceCodePosition(row, column, absolute)
386+
buffer = char
387+
if char in __WHITESPACE_CHARACTERS__: tokenKind = cls.TokenKind.SpaceChars
388+
elif char in __NUMBER_CHARACTERS__: tokenKind = cls.TokenKind.IntegerChars
389+
elif char in __ALPHA_CHARACTERS__: tokenKind = cls.TokenKind.AlphaChars
390+
elif char == "'": tokenKind = cls.TokenKind.PossibleCharacterLiteral
391+
elif char == "\"": tokenKind = cls.TokenKind.PossibleStringLiteralStart
392+
elif char == "-": tokenKind = cls.TokenKind.PossibleSingleLineCommentStart
393+
elif char == "\r": tokenKind = cls.TokenKind.PossibleLinebreak
394+
elif char == "\n":
395+
previousToken = LinebreakToken(previousToken, char, start, start)
396+
yield previousToken
397+
tokenKind = cls.TokenKind.OtherChars
398+
elif char in __FUSEABLE_CHARS__:
399+
buffer = char
400+
tokenKind = cls.TokenKind.FuseableCharacter
401+
elif char == ".": tokenKind = cls.TokenKind.PossibleRealLiteral
402+
elif char == "\\": tokenKind = cls.TokenKind.PossibleExtendedIdentifierStart
403+
elif (char == "`") and isinstance(previousToken, (WhitespaceToken, LinebreakToken)):
404+
tokenKind = cls.TokenKind.Directive
405+
else:
406+
previousToken = CharacterToken(previousToken, char, start)
407+
yield previousToken
408+
tokenKind = cls.TokenKind.OtherChars
377409

378410
# State: PossibleExtendedIdentifierStart
379411
elif tokenKind is cls.TokenKind.PossibleExtendedIdentifierStart:

tests/unit/Tokenizer/Tokens.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ class Sequence_1(TestCase, ExpectedDataMixin, TokenSequence):
130130
)
131131

132132
class Sequence_2(TestCase, ExpectedDataMixin, TokenSequence):
133-
code = """abc \\def\\ \t 'a' "abc" /* help */ -- foo\n """
133+
code = """abc \\def\\ \t 'a' ''' "abc" "\"\"" "foo\"\"" /* help */ -- foo\n """
134134
tokenStream = ExpectedTokenStream(
135135
[(StartOfDocumentToken, None),
136136
(WordToken, "abc"),
@@ -139,8 +139,14 @@ class Sequence_2(TestCase, ExpectedDataMixin, TokenSequence):
139139
(WhitespaceToken, " \t "),
140140
(CharacterLiteralToken, "a"),
141141
(WhitespaceToken, " "),
142+
(CharacterLiteralToken, "'"),
143+
(WhitespaceToken, " "),
142144
(StringLiteralToken, "abc"),
143145
(WhitespaceToken, " "),
146+
(StringLiteralToken, "\"\""),
147+
(WhitespaceToken, " "),
148+
(StringLiteralToken, "foo\"\""),
149+
(WhitespaceToken, " "),
144150
(MultiLineCommentToken, "/* help */"),
145151
(WhitespaceToken, " "),
146152
(SingleLineCommentToken, "-- foo\n"),
@@ -303,7 +309,7 @@ class Sequence_6(TestCase, ExpectedDataMixin, TokenSequence):
303309

304310

305311
class Sequence_7(TestCase, ExpectedDataMixin, TokenSequence):
306-
code = """constant BIT_STRING : UNSIGNED(0 downto 0) := UNSIGNED'(x\"0\");\nconstant LPAREN_CHAR : character := '(';\nfoo'('0')\nbar'('(')"""
312+
code = """constant BIT_STRING : UNSIGNED(0 downto 0) := UNSIGNED'(x\"0\");\nconstant LPAREN_CHAR : character := '(';\nfoo'('0')\nbar'('(')\ncharacter'(''')"""
307313
tokenStream = ExpectedTokenStream(
308314
[ (StartOfDocumentToken, None),
309315
(WordToken, "constant"),
@@ -355,11 +361,16 @@ class Sequence_7(TestCase, ExpectedDataMixin, TokenSequence):
355361
(CharacterToken, "("),
356362
(CharacterLiteralToken, "("),
357363
(CharacterToken, ")"),
364+
(LinebreakToken, None),
365+
(WordToken, "character"),
366+
(CharacterToken, "'"),
367+
(CharacterToken, "("),
368+
(CharacterLiteralToken, "'"),
369+
(CharacterToken, ")"),
358370
(EndOfDocumentToken, None)
359371
]
360372
)
361373

362-
363374
class Tokenizer_ExceptionInKeyword(TestCase, ExpectedDataMixin, TokenSequence):
364375
code = """keyword"""
365376
tokenStream = ExpectedTokenStream(

0 commit comments

Comments
 (0)