|
1 | | -using Antlr4.Runtime; |
2 | | -using System.Text; |
| 1 | +// ******* GRUN (Grammar Unit Test) for Python ******* |
| 2 | + |
| 3 | +using System; |
3 | 4 | using System.Text.RegularExpressions; |
| 5 | +using System.Text; |
| 6 | +using Antlr4.Runtime; |
4 | 7 |
|
namespace grun4py
{
    /// <summary>
    /// GRUN (Grammar Unit Test) driver for the ANTLR4 Python grammar:
    /// dumps the full token stream, then parses the file and uses the
    /// parser's syntax-error count as the process exit code.
    /// </summary>
    internal static class Program
    {
        // Default encoding for Python source code (PEP 3120).
        private const string DEFAULT_PYTHON_ENCODING = "utf-8";

        // PEP 263 encoding declaration, e.g. "# -*- coding: latin-1 -*-".
        // Cached as static compiled regexes so they are built once per process,
        // not rebuilt on every call.
        private static readonly Regex EncodingCommentPattern =
            new(@"^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)", RegexOptions.Compiled);

        // Matches a line consisting only of whitespace and/or a '#' comment.
        private static readonly Regex WsCommentPattern =
            new(@"^[ \t\f]*(#.*)?$", RegexOptions.Compiled);

        // Alias table normalizing common encoding spellings. The lookup is
        // ordinal case-insensitive, which avoids the culture-sensitive
        // ToLower() pitfall (e.g. "ISO-8859-1" under the Turkish locale).
        private static readonly Dictionary<string, string> EncodingMap =
            new(StringComparer.OrdinalIgnoreCase)
            {
                { "cp1252", "latin1" },
                { "latin-1", "latin1" },
                { "iso-8859-1", "latin1" }
                // more encoding pairs
            };

        /// <summary>
        /// Entry point. <c>args[0]</c> is the path of the Python source file.
        /// Returns the parser's syntax-error count (0 on success), or 1 on a
        /// usage or I/O error.
        /// </summary>
        public static int Main(string[] args)
        {
            if (args.Length < 1)
            {
                Console.Error.WriteLine("Error: Please provide an input file path");
                return 1;
            }

            try
            {
                var filePath = args[0];
                var input = GetEncodedInputStreamByPythonComment(filePath);
                var lexer = new PythonLexer(input);
                var tokens = new CommonTokenStream(lexer); // PythonLexer already is an ITokenSource; no cast needed
                var parser = new PythonParser(tokens);

                tokens.Fill(); // Test the lexer grammar
                foreach (IToken t in tokens.GetTokens())
                {
                    Console.WriteLine(GetTokenMetaDataWithTokenName(t));
                }

                parser.file_input(); // Test the parser grammar
                return parser.NumberOfSyntaxErrors;
            }
            catch (Exception ex)
            {
                Console.Error.WriteLine($"Error: {ex.Message}");
                return 1; // Error occurred, returning non-zero exit code
            }
        }

        /// <summary>
        /// Formats a token as
        /// [@TokenIndex,StartIndex:StopIndex='Text',&lt;TokenName&gt;,channel=ChannelName,Line:Column];
        /// the channel part is omitted for tokens on the default channel.
        /// </summary>
        private static string GetTokenMetaDataWithTokenName(IToken token)
        {
            string tokenText = ReplaceSpecialCharacters(token.Text);
            string tokenName = token.Type == TokenConstants.EOF
                ? "EOF"
                : PythonLexer.DefaultVocabulary.GetDisplayName(token.Type);
            string channelText = token.Channel == TokenConstants.DefaultChannel
                ? ""
                : $"channel={PythonLexer.channelNames[token.Channel]},";

            // Modified format: [@TokenIndex,StartIndex:StopIndex='Text',<TokenName>,channel=ChannelName,Line:Column]
            return $"[@{token.TokenIndex},{token.StartIndex}:{token.StopIndex}='{tokenText}',<{tokenName}>,{channelText}{token.Line}:{token.Column}]";
        }

        /// <summary>
        /// Escapes newline, carriage return, tab and form feed so token text
        /// can be printed on a single line.
        /// </summary>
        private static string ReplaceSpecialCharacters(string text)
        {
            return text.Replace("\n", @"\n")
                       .Replace("\r", @"\r")
                       .Replace("\t", @"\t")
                       .Replace("\f", @"\f");
        }

        /// <summary>
        /// Opens <paramref name="filePath"/> as an ANTLR char stream using the
        /// encoding declared in a PEP 263 coding comment on the first or second
        /// line, falling back to UTF-8 when no usable declaration is found.
        /// </summary>
        public static ICharStream? GetEncodedInputStreamByPythonComment(string filePath)
        {
            string encodingName = "";

            try
            {
                using FileStream fs = new(filePath, FileMode.Open, FileAccess.Read); // read in binary mode
                using StreamReader reader = new(fs, Encoding.ASCII);
                for (int lineCount = 0; lineCount < 2; lineCount++)
                {
                    string? line = reader.ReadLine();
                    if (line == null)
                    {
                        break; // EOF reached
                    }

                    if (WsCommentPattern.IsMatch(line)) // WS? + COMMENT? found
                    {
                        encodingName = GetEncodingName(line);
                        if (encodingName != "") // encoding found
                        {
                            break;
                        }
                    }
                    else
                    {
                        break; // statement or backslash found (line is not empty, not whitespace(s), not comment)
                    }
                }
            }
            catch (Exception)
            {
                // Deliberate best effort: any read error here just means
                // "no declared encoding"; the default is applied below.
            }

            if (encodingName == "")
            {
                encodingName = DEFAULT_PYTHON_ENCODING;
            }

            try // encoding test for ANTLR4
            {
                return CharStreams.fromPath(filePath, Encoding.GetEncoding(encodingName));
            }
            catch (Exception)
            {
                // Unknown/unsupported encoding name: fall back to the default.
                return CharStreams.fromPath(filePath, Encoding.GetEncoding(DEFAULT_PYTHON_ENCODING));
            }
        }

        /// <summary>
        /// Extracts and normalizes the encoding name from a PEP 263 coding
        /// comment, or returns "" when the line declares no encoding.
        /// See https://peps.python.org/pep-0263/#defining-the-encoding
        /// </summary>
        public static string GetEncodingName(string commentText)
        {
            var match = EncodingCommentPattern.Match(commentText);
            if (!match.Success)
            {
                return "";
            }

            string encodingName = match.Groups[1].Value;

            // normalize encoding name (case-insensitive alias lookup)
            return EncodingMap.TryGetValue(encodingName, out var normalizedEncodingName)
                ? normalizedEncodingName
                : encodingName;
        }
    }
}
0 commit comments