Skip to content

Commit 34ea488

Browse files
committed
update grun4py
1 parent afd8c22 commit 34ea488

4 files changed

Lines changed: 412 additions & 649 deletions

File tree

port_CSharp/grun4py.cs

Lines changed: 110 additions & 210 deletions
Original file line number | Diff line number | Diff line change
@@ -1,243 +1,143 @@
1-
using Antlr4.Runtime;
2-
using System.Text;
1+
// ******* GRUN (Grammar Unit Test) for Python *******
2+
3+
using System;
34
using System.Text.RegularExpressions;
5+
using System.Text;
6+
using Antlr4.Runtime;
47

5-
/// <summary>
6-
/// GRUN (Grammar Unit Test) for Python
7-
/// </summary>
8-
public class Grun4py
8+
namespace grun4py
99
{
10-
private const string UTF8 = "utf-8";
11-
private static readonly byte[] UTF8_BOM = [0xEF, 0xBB, 0xBF];
12-
private static readonly byte[] UTF32_BE_BOM = [0x00, 0x00, 0xFE, 0xFF];
13-
private static readonly byte[] UTF32_LE_BOM = [0xFF, 0xFE, 0x00, 0x00];
14-
private static readonly byte[] UTF16_BE_BOM = [0xFE, 0xFF];
15-
private static readonly byte[] UTF16_LE_BOM = [0xFF, 0xFE];
16-
private const int MAX_BOM_LENGTH = 4;
17-
18-
private const byte MAX_ASCII = 0x7f;
19-
20-
private static readonly Regex ENCODING_PATTERN =
21-
new(@"^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)", RegexOptions.Compiled);
22-
23-
private static readonly Regex COMMENT_PATTERN =
24-
new(@"^[ \t\f]*(#.*)?$", RegexOptions.Compiled);
25-
26-
private static readonly Dictionary<string, string> ENCODING_MAP = new()
27-
{
28-
// UTF-8
29-
{ "utf8sig", "utf-8" },
30-
{ "utf8", "utf-8" },
31-
{ "utf", "utf-8" },
32-
// UTF-16 LE
33-
{ "utf16le", "utf-16LE" },
34-
{ "utf16", "utf-16LE" },
35-
{ "ucs2", "utf-16LE" },
36-
{ "ucs2le", "utf-16LE" },
37-
// ISO-8859-1 / Latin-1
38-
{ "latin1", "iso-8859-1" },
39-
{ "latin", "iso-8859-1" },
40-
{ "iso88591", "iso-8859-1" },
41-
{ "iso8859", "iso-8859-1" },
42-
{ "cp819", "iso-8859-1" },
43-
// ASCII
44-
{ "ascii", "us-ascii" },
45-
{ "usascii", "us-ascii" },
46-
{ "ansiX341968", "us-ascii" },
47-
{ "cp367", "us-ascii" },
48-
// Deprecated alias
49-
{ "binary", "iso-8859-1" },
50-
};
51-
52-
public static int Main(string[] args)
10+
internal static class Program
5311
{
54-
if (args.Length < 1)
12+
public static int Main(string[] args)
5513
{
56-
Console.Error.WriteLine("Error: Please provide an input file path");
57-
return 1;
58-
}
59-
60-
string filePath = args[0];
61-
try
62-
{
63-
string encodingName = DetectEncoding(filePath);
64-
var input = CharStreams.fromPath(filePath, Encoding.GetEncoding(encodingName));
14+
if (args.Length < 1)
15+
{
16+
Console.Error.WriteLine("Error: Please provide an input file path");
17+
return 1;
18+
}
6519

66-
PythonLexer lexer = new(input);
67-
lexer.SetEncodingName(encodingName); // generate ENCODING token
20+
try
21+
{
22+
var filePath = args[0];
23+
var input = GetEncodedInputStreamByPythonComment(filePath);
24+
var lexer = new PythonLexer(input);
25+
var tokens = new CommonTokenStream((ITokenSource)lexer);
26+
var parser = new PythonParser(tokens);
27+
28+
tokens.Fill(); // Test the lexer grammar
29+
foreach (IToken t in tokens.GetTokens())
30+
{
31+
Console.WriteLine(GetTokenMetaDataWithTokenName(t));
32+
}
6833

69-
CommonTokenStream tokens = new(lexer);
70-
PythonParser parser = new(tokens);
34+
parser.file_input(); // Test the parser grammar
35+
return parser.NumberOfSyntaxErrors;
7136

72-
tokens.Fill();
73-
foreach (IToken token in tokens.GetTokens())
37+
}
38+
catch (Exception ex)
7439
{
75-
Console.WriteLine(FormatToken(token));
40+
Console.Error.WriteLine($"Error: {ex.Message}");
41+
return 1; // Error occurred, returning non-zero exit code
7642
}
77-
78-
parser.file_input();
79-
return parser.NumberOfSyntaxErrors;
80-
}
81-
catch (Exception ex)
82-
{
83-
Console.Error.WriteLine("Error: " + ex.Message);
84-
Console.Error.WriteLine(ex.StackTrace);
85-
return 1;
8643
}
87-
}
88-
89-
// ---------- Token formatting ----------
90-
91-
private static string FormatToken(IToken token)
92-
{
93-
string tokenText = EscapeSpecialChars(token.Text);
94-
string tokenName = token.Type == TokenConstants.EOF
95-
? "EOF"
96-
: PythonLexer.DefaultVocabulary.GetSymbolicName(token.Type);
9744

98-
string channelName = token.Channel == TokenConstants.DefaultChannel
99-
? ""
100-
: $"channel={token.Channel},";
101-
102-
return $"[@{token.TokenIndex},{token.StartIndex}:{token.StopIndex}='{tokenText}',<{tokenName}>,{channelName}{token.Line}:{token.Column}]";
103-
}
104-
105-
private static string EscapeSpecialChars(string text)
106-
{
107-
return text
108-
.Replace("\n", "\\n")
109-
.Replace("\r", "\\r")
110-
.Replace("\t", "\\t")
111-
.Replace("\f", "\\f");
112-
}
113-
114-
// ---------- Encoding detection ----------
115-
116-
private static string DetectEncoding(string filePath)
117-
{
118-
bool hasUTF8BOM = DetectUTF8BOM(filePath);
119-
string commentEncoding = DetectEncodingFromComments(filePath, hasUTF8BOM);
120-
return ResolveFinalEncoding(filePath, hasUTF8BOM, commentEncoding);
121-
}
122-
123-
private static bool DetectUTF8BOM(string filePath)
124-
{
125-
byte[] buffer = new byte[MAX_BOM_LENGTH];
126-
using FileStream stream = File.OpenRead(filePath);
127-
int bytesRead = stream.Read(buffer, 0, MAX_BOM_LENGTH);
128-
129-
if (BufferStartsWith(buffer, bytesRead, UTF8_BOM)) return true;
130-
if (BufferStartsWith(buffer, bytesRead, UTF32_BE_BOM)) throw BomError(filePath, "UTF-32 BE BOM");
131-
if (BufferStartsWith(buffer, bytesRead, UTF32_LE_BOM)) throw BomError(filePath, "UTF-32 LE BOM");
132-
if (BufferStartsWith(buffer, bytesRead, UTF16_BE_BOM)) throw BomError(filePath, "UTF-16 BE BOM");
133-
if (BufferStartsWith(buffer, bytesRead, UTF16_LE_BOM)) throw BomError(filePath, "UTF-16 LE BOM");
134-
return false;
135-
}
136-
137-
private static bool BufferStartsWith(byte[] buffer, int bytesRead, byte[] bom)
138-
{
139-
if (bytesRead < bom.Length) return false; // Not enough bytes to match this BOM
140-
141-
for (int i = 0; i < bom.Length; i++)
45+
private static string GetTokenMetaDataWithTokenName(IToken token)
14246
{
143-
if (buffer[i] != bom[i])
144-
return false;
47+
string tokenText = ReplaceSpecialCharacters(token.Text);
48+
string tokenName = token.Type == TokenConstants.EOF ? "EOF" : PythonLexer.DefaultVocabulary.GetDisplayName(token.Type);
49+
string channelText = token.Channel == TokenConstants.DefaultChannel ?
50+
"" :
51+
$"channel={PythonLexer.channelNames[token.Channel]},";
52+
53+
// Modified format: [@TokenIndex,StartIndex:StopIndex='Text',<TokenName>,channel=ChannelName,Line:Column]
54+
return $"[@{token.TokenIndex},{token.StartIndex}:{token.StopIndex}='{tokenText}',<{tokenName}>,{channelText}{token.Line}:{token.Column}]";
14555
}
146-
return true;
147-
}
14856

149-
private static IOException BomError(string filePath, string msg)
150-
{
151-
return new IOException($"Invalid BOM encoding for '{Path.GetFileName(filePath)}': {msg}");
152-
}
57+
private static string ReplaceSpecialCharacters(string text)
58+
{
59+
return text.Replace("\n", @"\n")
60+
.Replace("\r", @"\r")
61+
.Replace("\t", @"\t")
62+
.Replace("\f", @"\f");
15363

154-
private static string DetectEncodingFromComments(string filePath, bool hasUTF8BOM)
155-
{
156-
using FileStream stream = File.OpenRead(filePath);
157-
if (hasUTF8BOM) stream.Seek(UTF8_BOM.Length, SeekOrigin.Begin);
64+
}
15865

159-
for (int i = 0; i < 2; i++)
66+
public static ICharStream? GetEncodedInputStreamByPythonComment(string filePath)
16067
{
161-
string? line = ReadAsciiLine(stream);
162-
if (line == null) return "";
68+
string encodingName = "";
69+
var ws_commentPattern = new Regex(@"^[ \t\f]*(#.*)?$");
16370

164-
if (COMMENT_PATTERN.IsMatch(line))
71+
try
16572
{
166-
string enc = ExtractEncodingFromLine(line);
167-
if (!string.IsNullOrEmpty(enc))
73+
using FileStream fs = new(filePath, FileMode.Open, FileAccess.Read); // read in binary mode
74+
using StreamReader reader = new(fs, Encoding.ASCII);
75+
for (int lineCount = 0; lineCount < 2; lineCount++)
16876
{
169-
return enc; // encoding found in comment
77+
string? line = reader.ReadLine();
78+
if (line == null)
79+
{
80+
break; // EOF reached
81+
}
82+
83+
if (ws_commentPattern.IsMatch(line)) // WS? + COMMENT? found
84+
{
85+
encodingName = GetEncodingName(line);
86+
if (encodingName != "") // encoding found
87+
{
88+
break;
89+
}
90+
}
91+
else
92+
{
93+
break; // statement or backslash found (line is not empty, not whitespace(s), not comment)
94+
}
17095
}
17196
}
172-
else
97+
catch (Exception)
17398
{
174-
break; // statement or backslash found (the line is not blank, not whitespace(s), not comment)
99+
// Console.WriteLine($"An error occurred: {e.Message}");
175100
}
176-
}
177-
return "";
178-
}
179101

180-
private static string? ReadAsciiLine(FileStream stream)
181-
{
182-
StringBuilder lineBuilder = new();
183-
int ascii;
184-
while ((ascii = stream.ReadByte()) != -1)
185-
{
186-
if (ascii > MAX_ASCII) return null;
187-
if (ascii == '\n') break;
188-
if (ascii != '\r') lineBuilder.Append((char)ascii);
189-
}
190-
return lineBuilder.ToString();
191-
}
192-
193-
private static string ExtractEncodingFromLine(string line)
194-
{
195-
Match m = ENCODING_PATTERN.Match(line);
196-
if (!m.Success) return "";
197-
return NormalizeEncoding(m.Groups[1].Value);
198-
}
102+
const string DEFAULT_PYTHON_ENCODING = "utf-8"; // default encoding for Python source code
103+
if (encodingName == "")
104+
{
105+
encodingName = DEFAULT_PYTHON_ENCODING;
106+
}
199107

200-
private static string NormalizeEncoding(string enc)
201-
{
202-
if (string.IsNullOrEmpty(enc)) return enc;
203-
204-
string normalized = enc
205-
.ToLower()
206-
.Replace("_", "")
207-
.Replace("-", "")
208-
.Replace(" ", "")
209-
.RegexReplace(@"codec$", "");
210-
211-
return ENCODING_MAP.TryGetValue(normalized, out var value)
212-
? value
213-
: enc;
214-
}
108+
try // encoding test for ANTLR4
109+
{
110+
return CharStreams.fromPath(filePath, Encoding.GetEncoding(encodingName));
111+
}
112+
catch (Exception)
113+
{
114+
return CharStreams.fromPath(filePath, Encoding.GetEncoding(DEFAULT_PYTHON_ENCODING));
115+
}
215116

216-
private static string ResolveFinalEncoding(string filePath,
217-
bool hasUTF8_BOM,
218-
string commentEncodingName)
219-
{
220-
bool hasConflict = !string.IsNullOrEmpty(commentEncodingName)
221-
&& hasUTF8_BOM
222-
&& !IsUTF8(commentEncodingName);
223-
if (hasConflict)
224-
{
225-
throw new IOException($"Encoding problem for '{Path.GetFileName(filePath)}': utf-8 BOM");
226117
}
227-
return string.IsNullOrEmpty(commentEncodingName) ? UTF8 : commentEncodingName;
228-
}
229118

230-
private static bool IsUTF8(string enc)
231-
{
232-
return enc.Replace("-", "").Replace("_", "").Equals("utf8", StringComparison.CurrentCultureIgnoreCase);
233-
}
234-
}
119+
public static string GetEncodingName(string commentText) // https://peps.python.org/pep-0263/#defining-the-encoding
120+
{
121+
var encodingCommentPattern = new Regex(@"^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)");
122+
var match = encodingCommentPattern.Match(commentText);
123+
if (match.Success)
124+
{
125+
string encodingName = match.Groups[1].Value;
235126

236-
// Extension method for regex replacement
237-
public static class StringExtensions
238-
{
239-
public static string RegexReplace(this string input, string pattern, string replacement)
240-
{
241-
return Regex.Replace(input, pattern, replacement);
127+
// normalize encoding name
128+
var encodingMap = new Dictionary<string, string>
129+
{
130+
{ "cp1252", "latin1" },
131+
{ "latin-1", "latin1" },
132+
{ "iso-8859-1", "latin1" }
133+
// more encoding pairs
134+
};
135+
136+
return encodingMap.TryGetValue(encodingName.ToLower(), out var normalizedEncodingName)
137+
? normalizedEncodingName
138+
: encodingName;
139+
}
140+
return "";
141+
}
242142
}
243143
}

0 commit comments

Comments
 (0)