Skip to content

Commit d69a7a0

Browse files
author
Luther Tychonievich
committed
add language tags; fix downloading script
1 parent 96deb13 commit d69a7a0

5 files changed

Lines changed: 735 additions & 116 deletions

File tree

DownloadDefinitions.java

Lines changed: 83 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,40 @@
11
import java.net.URL;
22
import java.util.Scanner;
3-
import java.util.ArrayList;
43
import java.util.TreeMap;
4+
import java.util.TreeSet;
55
import java.io.IOException;
66
import java.io.PrintWriter;
77
import java.io.File;
88

99
public class DownloadDefinitions {
10+
static void printStringArrayCode(Iterable<String> toPrint, String name, PrintWriter out) {
11+
out.println(" private static final String[] "+name+" =");
12+
char before = '{';
13+
for(String val : toPrint) {
14+
out.println(" "+before+'"'+val+'"');
15+
before = ',';
16+
}
17+
out.println(" };");
18+
}
19+
static TreeMap<String,String> readTSV(Scanner src) {
20+
TreeMap<String,String> ans = new TreeMap<String,String>();
21+
while(src.hasNext()) {
22+
String line = src.nextLine();
23+
ans.put(line.replaceAll("\t[^\t]*$",""), line.replaceAll("[^\t]*\t",""));
24+
}
25+
return ans;
26+
}
27+
static void addTags(TreeMap<String,String> src, TreeMap<String,String> dst) {
28+
for(String key : src.keySet()) {
29+
String tag = key.split("\t")[1];
30+
String val = src.get(key);
31+
String old = dst.get(val);
32+
if (old != null && !tag.equals(old))
33+
throw new RuntimeException("ERROR: uri "+val+" has multiple tags\n\t- "+old+"\n\t- "+tag);
34+
else if (old == null) dst.put(val, tag);
35+
}
36+
}
37+
1038
public static void main(String[] args) throws IOException {
1139
try (PrintWriter dest = new PrintWriter(new File("edu/virginia/ged5to7/GedcomDefinitions.java"))) {
1240
dest.println("/* WARNING: This file is automatically generated and should not be edited by hand */");
@@ -17,28 +45,17 @@ public static void main(String[] args) throws IOException {
1745

1846

1947
Scanner s;
20-
char before;
21-
ArrayList<String> lines;
48+
TreeMap<String,String> known;
49+
TreeMap<String,String> tagOf = new TreeMap<String,String>();
2250

2351
// enumerations
2452
s = new Scanner(new URL("https://github.com/FamilySearch/GEDCOM/raw/main/extracted-files/enumerations.tsv").openStream());
25-
lines = new ArrayList<String>();
26-
while(s.hasNext()) lines.add(s.nextLine());
27-
lines.sort(null);
28-
dest.println(" private static final String[] enumKeys =");
29-
before = '{';
30-
for(String line : lines) {
31-
dest.println(" "+before+'"'+line.replaceAll("\t[^\t]*$","\""));
32-
before = ',';
33-
}
34-
dest.println(" };");
35-
dest.println(" private static final String[] enumVals =");
36-
before = '{';
37-
for(String line : lines) {
38-
dest.println(" "+before+line.replaceAll(".*\t","\"")+"\"");
39-
before = ',';
40-
}
41-
dest.println(" };");
53+
known = readTSV(s);
54+
printStringArrayCode(known.keySet(), "enumKeys", dest);
55+
printStringArrayCode(known.values(), "enumVals", dest);
56+
printStringArrayCode(new TreeSet<String>(known.values()), "enumSet", dest);
57+
addTags(known, tagOf);
58+
4259
dest.println(" /** Looks up the URI of an enumeration based on the GEDCOM 7 spec");
4360
dest.println(" * @param ctx the URI of the containing structure.");
4461
dest.println(" * use <code>null</code> for an extension.");
@@ -48,38 +65,31 @@ public static void main(String[] args) throws IOException {
4865
dest.println(" public static String enumURI(String ctx, String tag) {");
4966
dest.println(" if (ctx == null) {");
5067
dest.println(" String val = \"https://gedcom.io/terms/v7/\"+tag;");
51-
dest.println(" int idx = binarySearch(enumVals, val);");
68+
dest.println(" int idx = binarySearch(enumSet, val);");
5269
dest.println(" if (idx < 0) return null;");
53-
dest.println(" return enumVals[idx];");
70+
dest.println(" return enumSet[idx];");
5471
dest.println(" } else {");
5572
dest.println(" String key = ctx+'\\t'+tag;");
5673
dest.println(" int idx = binarySearch(enumKeys, key);");
5774
dest.println(" if (idx < 0) return null;");
5875
dest.println(" return enumVals[idx];");
5976
dest.println(" }");
6077
dest.println(" }");
78+
dest.println(" public static boolean isStdEnum(String uri) {");
79+
dest.println(" return binarySearch(enumSet, uri) >= 0;");
80+
dest.println(" }");
6181

6282
dest.println();
6383

6484
// substructures
6585
s = new Scanner(new URL("https://github.com/FamilySearch/GEDCOM/raw/main/extracted-files/substructures.tsv").openStream());
66-
lines = new ArrayList<String>();
67-
while(s.hasNext()) lines.add(s.nextLine());
68-
lines.sort(null);
69-
dest.println(" private static final String[] structKeys =");
70-
before = '{';
71-
for(String line : lines) {
72-
dest.println(" "+before+'"'+line.replaceAll("\t[^\t]*$","\""));
73-
before = ',';
74-
}
75-
dest.println(" };");
76-
dest.println(" private static final String[] structVals =");
77-
before = '{';
78-
for(String line : lines) {
79-
dest.println(" "+before+line.replaceAll(".*\t","\"")+"\"");
80-
before = ',';
81-
}
82-
dest.println(" };");
86+
known = readTSV(s);
87+
known.put("\tHEAD", "HEAD pseudostructure"); //// HARD-CODE based on substructures.tsv implementation
88+
printStringArrayCode(known.keySet(), "structKeys", dest);
89+
printStringArrayCode(known.values(), "structVals", dest);
90+
printStringArrayCode(new TreeSet<String>(known.values()), "structSet", dest);
91+
addTags(known, tagOf);
92+
8393
dest.println(" /** Looks up the URI of an structure type based on the GEDCOM 7 spec");
8494
dest.println(" * @param ctx the URI of the containing structure type");
8595
dest.println(" * use <code>\"\"</code> for a record and <code>null</code> for an extension.");
@@ -89,70 +99,42 @@ public static void main(String[] args) throws IOException {
8999
dest.println(" public static String structURI(String ctx, String tag) {");
90100
dest.println(" if (ctx == null) {");
91101
dest.println(" String val = \"https://gedcom.io/terms/v7/\"+tag;");
92-
dest.println(" int idx = binarySearch(structVals, val);");
102+
dest.println(" int idx = binarySearch(structSet, val);");
93103
dest.println(" if (idx < 0) return null;");
94-
dest.println(" return structVals[idx];");
104+
dest.println(" return structSet[idx];");
95105
dest.println(" } else {");
96106
dest.println(" String key = ctx+'\\t'+tag;");
97107
dest.println(" int idx = binarySearch(structKeys, key);");
98108
dest.println(" if (idx < 0) return null;");
99109
dest.println(" return structVals[idx];");
100110
dest.println(" }");
101111
dest.println(" }");
112+
dest.println(" public static boolean isStdStruct(String uri) {");
113+
dest.println(" return binarySearch(structSet, uri) >= 0;");
114+
dest.println(" }");
102115

103-
// structure types -- uses same file and substructures above
104-
TreeMap<String,String> knownStructs = new TreeMap<String, String>();
105-
for(String line : lines) {
106-
knownStructs.put(line.replaceAll(".*\t",""), line.replaceAll("^[^\t]*\t|\t[^\t]*$",""));
107-
}
108-
dest.println(" private static final String[] knownStructs =");
109-
before = '{';
110-
for(String uri : knownStructs.keySet()) {
111-
dest.println(" "+before+'"'+uri+'"');
112-
before = ',';
113-
}
114-
dest.println(" };");
115-
dest.println(" private static final String[] uriTag =");
116-
before = '{';
117-
for(String tag : knownStructs.values()) {
118-
dest.println(" "+before+'"'+tag+'"');
119-
before = ',';
120-
}
121-
dest.println(" };");
122-
dest.println(" /** Looks up the tag of a structure URIbased on the GEDCOM 7 spec");
116+
printStringArrayCode(tagOf.keySet(), "tagKeys", dest);
117+
printStringArrayCode(tagOf.values(), "tagVals", dest);
118+
119+
dest.println(" /** Looks up the tag of a structure URI based on the GEDCOM 7 spec");
123120
dest.println(" * @param uri the URI of the structure type");
124121
dest.println(" * @return the tag of the structure type, or <code>null</code> if unknown");
125122
dest.println(" */");
126123
dest.println(" public static String structTag(String uri) {");
127124
dest.println(" if (uri == null) return null;");
128-
dest.println(" int idx = binarySearch(knownStructs, uri);");
125+
dest.println(" int idx = binarySearch(tagKeys, uri);");
129126
dest.println(" if (idx < 0) return null;");
130-
dest.println(" return uriTag[idx];");
127+
dest.println(" return tagVals[idx];");
131128
dest.println(" }");
132-
133-
134129

135130
dest.println();
136131

137132
// payloads
138133
s = new Scanner(new URL("https://github.com/FamilySearch/GEDCOM/raw/main/extracted-files/payloads.tsv").openStream());
139-
lines = new ArrayList<String>();
140-
while(s.hasNext()) lines.add(s.nextLine());
141-
lines.sort(null);
142-
dest.println(" private static final String[] payloadKeys =");
143-
before = '{';
144-
for(String line : lines) {
145-
dest.println(" "+before+'"'+line.replaceAll("\t[^\t]*$","\""));
146-
before = ',';
147-
}
148-
dest.println(" };");
149-
dest.println(" private static final String[] payloadVals =");
150-
before = '{';
151-
for(String line : lines) {
152-
dest.println(" "+before+line.replaceAll(".*\t","\"")+"\"");
153-
before = ',';
154-
}
155-
dest.println(" };");
134+
known = readTSV(s);
135+
printStringArrayCode(known.keySet(), "payloadKeys", dest);
136+
printStringArrayCode(known.values(), "payloadVals", dest);
137+
156138
dest.println(" /** Looks up the payload type of a structure based on the GEDCOM 7 spec");
157139
dest.println(" * @param ctx the URI of the containing structure type");
158140
dest.println(" * @return the type code (URI or <code>\"Y|<NULL>\"</code> or <code>\"\"</code> or <code>\"@XREF:</code>tag<code>\"</code>) of the payload type, or <code>null</code> if unknown");
@@ -164,6 +146,25 @@ public static void main(String[] args) throws IOException {
164146
dest.println(" return payloadVals[idx];");
165147
dest.println(" }");
166148

149+
// FHISO's language mapping
150+
s = new Scanner(new URL("https://github.com/fhiso/legacy-format/raw/master/languages.tsv").openStream());
151+
known = readTSV(s);
152+
printStringArrayCode(known.keySet(), "langKeys", dest);
153+
printStringArrayCode(known.values(), "langVals", dest);
154+
dest.println(" /** Looks up the language tag type of a language based ELF's mapping");
155+
dest.println(" * @param lang the 5.5.1 language name");
156+
dest.println(" * @return the BCP-47 language tag, or <code>null</code> if unknown");
157+
dest.println(" */");
158+
dest.println(" public static String langTag(String ctx) {");
159+
dest.println(" if (ctx == null) return null;");
160+
dest.println(" int idx = binarySearch(langKeys, ctx, String.CASE_INSENSITIVE_ORDER);");
161+
dest.println(" if (idx < 0) return null;");
162+
dest.println(" return langVals[idx].replace(\"*\",\"\");");
163+
dest.println(" }");
164+
165+
166+
167+
167168

168169
dest.println("}");
169170
}

edu/virginia/ged5to7/Converter5to7.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ public Converter5to7(String filename, int id_base) {
5757
new NoteFilter(),
5858
new SourceFilter(),
5959
new ObjectFilter(),
60+
new LanguageFilter(),
6061
};
6162
for(Filter f : filters) {
6263
java.util.LinkedList<GedStruct> created = new java.util.LinkedList<GedStruct>();

edu/virginia/ged5to7/GedStruct.java

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,22 +46,22 @@ public GedStruct(String line) {
4646
public GedStruct(GedStruct sup, String tag) {
4747
this.sub = new LinkedList<GedStruct>();
4848
if (tag.indexOf(':') < 0) this.tag = tag;
49-
else this.uri = tag;
49+
else { this.uri = tag; this.uri2tag(); }
5050
if (sup != null) { sup.addSubstructure(this); this.level = sup.level+1; }
5151
else { this.sup = null; this.level = 0; }
5252
}
5353
public GedStruct(GedStruct sup, String tag, String payload) {
5454
this.sub = new LinkedList<GedStruct>();
5555
if (tag.indexOf(':') < 0) this.tag = tag;
56-
else this.uri = tag;
56+
else { this.uri = tag; this.uri2tag(); }
5757
this.payload = payload;
5858
if (sup != null) { sup.addSubstructure(this); this.level = sup.level+1; }
5959
else { this.sup = null; this.level = 0; }
6060
}
6161
public GedStruct(GedStruct sup, String tag, GedStruct payload) {
6262
this.sub = new LinkedList<GedStruct>();
6363
if (tag.indexOf(':') < 0) this.tag = tag;
64-
else this.uri = tag;
64+
else { this.uri = tag; this.uri2tag(); }
6565
this.pointsTo = payload;
6666
if (payload == null) this.payload = "@VOID@";
6767
else if (payload.incoming != null) payload.incoming.add(this);
@@ -99,8 +99,7 @@ static private String fixAtSign(String payload) {
9999
}
100100

101101
public void tag2uri() {
102-
if (sup == null && tag.equals("HEAD")) uri = "HEAD pseudostructure";
103-
else if (sup == null) uri = GedcomDefinitions.structURI("", tag);
102+
if (sup == null) uri = GedcomDefinitions.structURI("", tag);
104103
else if (sup.uri == null || GedcomDefinitions.structTag(sup.uri) == null)
105104
uri = GedcomDefinitions.structURI(null, tag);
106105
else uri = GedcomDefinitions.structURI(sup.uri, tag);
@@ -176,8 +175,6 @@ public void pointTo(GedStruct struct) {
176175
}
177176
}
178177

179-
// accumulate pointed-to-by
180-
// convert tag to URI
181178
// validate payload datatypes and pointed-to types
182179

183180
/**
@@ -202,7 +199,7 @@ else if (payload != null) {
202199
sb.append(' ');
203200
sb.append(payload.replaceAll("^@|\\n@|\\r@","$0@").replaceAll("\r\n?|\n", "\n"+(level+1)+" CONT "));
204201
}
205-
//if (incoming != null && incoming.size() > 0) sb.append(" <- "+incoming.size());
202+
//if (uri != null) sb.append(" <"+uri+">");
206203
sb.append("\n");
207204
for(GedStruct s : sub) s.serialize(sb);
208205
}

0 commit comments

Comments
 (0)