Skip to content

Commit 6d67808

Browse files
author
Luther Tychonievich
committed
add language parsing and other lookups
1 parent 3c8820d commit 6d67808

File tree

6 files changed

+188
-7563
lines changed

6 files changed

+188
-7563
lines changed

DownloadDefinitions.java

Lines changed: 61 additions & 215 deletions
Original file line numberDiff line numberDiff line change
@@ -1,228 +1,74 @@
1 1
import java.net.URL;
2 2
import java.util.Scanner;
3-
import java.util.TreeMap;
4-
import java.util.TreeSet;
5 3
import java.io.IOException;
6-
import java.io.PrintWriter;
7-
import java.io.File;
4+
import java.io.BufferedInputStream;
5+
import java.io.FileOutputStream;
8 6

9 7
public class DownloadDefinitions {
10-
static void printStringArrayCode(Iterable<String> toPrint, String name, PrintWriter out) {
11-
out.println(" private static final String[] "+name+" =");
12-
char before = '{';
13-
for(String val : toPrint) {
14-
out.println(" "+before+'"'+val+'"');
15-
before = ',';
16-
}
17-
out.println(" };");
18-
}
19-
static void printBooleanArrayCode(Iterable<Boolean> toPrint, String name, PrintWriter out) {
20-
out.println(" private static final boolean[] "+name+" =");
21-
char before = '{';
22-
for(boolean val : toPrint) {
23-
out.println(" "+before+val);
24-
before = ',';
8+
9+
private static void download(String urlname, String filename) {
10+
try {
11+
URL url = new URL(urlname);
12+
try(BufferedInputStream bis = new BufferedInputStream(url.openStream())) {
13+
try(FileOutputStream fos = new FileOutputStream(filename)) {
14+
byte[] buffer = new byte[1024];
15+
int count=0;
16+
while((count = bis.read(buffer,0,buffer.length)) != -1) {
17+
fos.write(buffer, 0, count);
18+
}
19+
}
20+
}
21+
} catch (IOException ex) {
22+
System.err.println("Unable to download\n from: "+urlname+ "\n to: "+filename+"\n "+ex);
25 23
}
26-
out.println(" };");
27 24
}
28-
static void printStringArrayArrayCode(Iterable<? extends Iterable<String>> toPrint, String name, PrintWriter out) {
29-
out.println(" private static final String[][] "+name+" =");
30-
char before = '{';
31-
for(Iterable<String> val : toPrint) {
32-
out.println(" "+before+"{\""+String.join("\",\"", val)+"\"}");
33-
before = ',';
25+
26+
private static void downloadIANALanguageSubtagRegistery(String filename) {
27+
try {
28+
URL url = new URL("https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry");
29+
Scanner from = new Scanner(url.openStream());
30+
try(FileOutputStream fos = new FileOutputStream(filename)) {
31+
boolean isLang = false;
32+
String tag = null;
33+
while(from.hasNextLine()) {
34+
String line = from.nextLine();
35+
if (line.startsWith("Type: ")) isLang = line.equals("Type: language");
36+
else if (line.startsWith("Subtag: ")) tag = line.substring(8);
37+
else if (isLang && line.startsWith("Description: ")) {
38+
String key = line.substring(13);
39+
fos.write((key+"\t"+tag+"\n").getBytes("UTF-8"));
40+
}
41+
}
42+
}
43+
} catch (IOException ex) {
44+
System.err.println("Unable to download\n from: https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry\n to: "+filename+"\n "+ex);
34 45
}
35-
out.println(" };");
36 46
}
37 47

38-
39-
static TreeMap<String,String> readTSV(Scanner src) {
40-
TreeMap<String,String> ans = new TreeMap<String,String>();
41-
while(src.hasNext()) {
42-
String line = src.nextLine();
43-
ans.put(line.replaceAll("\t[^\t]*$",""), line.replaceAll("[^\t]*\t",""));
44-
}
45-
return ans;
46-
}
47-
static void addTags(TreeMap<String,String> src, TreeMap<String,String> dst) {
48-
for(String key : src.keySet()) {
49-
String tag = key.split("\t")[1];
50-
String val = src.get(key);
51-
String old = dst.get(val);
52-
if (old != null && !tag.equals(old))
53-
throw new RuntimeException("ERROR: uri "+val+" has multiple tags\n\t- "+old+"\n\t- "+tag);
54-
else if (old == null) dst.put(val, tag);
55-
}
56-
}
57 48

58-
public static void main(String[] args) throws IOException {
59-
try (PrintWriter dest = new PrintWriter(new File("edu/virginia/ged5to7/GedcomDefinitions.java"))) {
60-
dest.println("/* WARNING: This file is automatically generated and should not be edited by hand */");
61-
dest.println("package edu.virginia.ged5to7;");
62-
dest.println("import static java.util.Arrays.binarySearch;");
63-
dest.println("\n/** A container for the substructure, payload, and enumeration rules from gedcom.io */");
64-
dest.println("public class GedcomDefinitions {");
65-
66-
67-
Scanner s;
68-
TreeMap<String,String> known;
69-
TreeMap<String,String> tagOf = new TreeMap<String,String>();
70-
71-
// enumerations
72-
s = new Scanner(new URL("https://github.com/FamilySearch/GEDCOM/raw/main/extracted-files/enumerations.tsv").openStream());
73-
known = readTSV(s);
74-
printStringArrayCode(known.keySet(), "enumKeys", dest);
75-
printStringArrayCode(known.values(), "enumVals", dest);
76-
printStringArrayCode(new TreeSet<String>(known.values()), "enumSet", dest);
77-
addTags(known, tagOf);
78-
79-
dest.println(" /** Looks up the URI of an enumeration based on the GEDCOM 7 spec");
80-
dest.println(" * @param ctx the URI of the containing structure.");
81-
dest.println(" * use <code>null</code> for an extension.");
82-
dest.println(" * @param tag the enumeration value");
83-
dest.println(" * @return the URI of the enumeration value, or <code>null</code> if unknown");
84-
dest.println(" */");
85-
dest.println(" public static String enumURI(String ctx, String tag) {");
86-
dest.println(" if (ctx == null) {");
87-
dest.println(" String val = \"https://gedcom.io/terms/v7/\"+tag;");
88-
dest.println(" int idx = binarySearch(enumSet, val);");
89-
dest.println(" if (idx < 0) return null;");
90-
dest.println(" return enumSet[idx];");
91-
dest.println(" } else {");
92-
dest.println(" String key = ctx+'\\t'+tag;");
93-
dest.println(" int idx = binarySearch(enumKeys, key);");
94-
dest.println(" if (idx < 0) return null;");
95-
dest.println(" return enumVals[idx];");
96-
dest.println(" }");
97-
dest.println(" }");
98-
dest.println(" public static boolean isStdEnum(String uri) {");
99-
dest.println(" return binarySearch(enumSet, uri) >= 0;");
100-
dest.println(" }");
101-
102-
dest.println();
103-
104-
// substructures
105-
s = new Scanner(new URL("https://github.com/FamilySearch/GEDCOM/raw/main/extracted-files/substructures.tsv").openStream());
106-
known = readTSV(s);
107-
known.put("\tHEAD", "HEAD pseudostructure"); //// HARD-CODE based on substructures.tsv implementation
108-
printStringArrayCode(known.keySet(), "structKeys", dest);
109-
printStringArrayCode(known.values(), "structVals", dest);
110-
printStringArrayCode(new TreeSet<String>(known.values()), "structSet", dest);
111-
addTags(known, tagOf);
112-
113-
dest.println(" /** Looks up the URI of an structure type based on the GEDCOM 7 spec");
114-
dest.println(" * @param ctx the URI of the containing structure type");
115-
dest.println(" * use <code>\"\"</code> for a record and <code>null</code> for an extension.");
116-
dest.println(" * @param tag the tag of the structure");
117-
dest.println(" * @return the URI of the structure type, or <code>null</code> if unknown");
118-
dest.println(" */");
119-
dest.println(" public static String structURI(String ctx, String tag) {");
120-
dest.println(" if (ctx == null) {");
121-
dest.println(" String val = \"https://gedcom.io/terms/v7/\"+tag;");
122-
dest.println(" int idx = binarySearch(structSet, val);");
123-
dest.println(" if (idx < 0) return null;");
124-
dest.println(" return structSet[idx];");
125-
dest.println(" } else {");
126-
dest.println(" String key = ctx+'\\t'+tag;");
127-
dest.println(" int idx = binarySearch(structKeys, key);");
128-
dest.println(" if (idx < 0) return null;");
129-
dest.println(" return structVals[idx];");
130-
dest.println(" }");
131-
dest.println(" }");
132-
dest.println(" public static boolean isStdStruct(String uri) {");
133-
dest.println(" return binarySearch(structSet, uri) >= 0;");
134-
dest.println(" }");
135-
136-
printStringArrayCode(tagOf.keySet(), "tagKeys", dest);
137-
printStringArrayCode(tagOf.values(), "tagVals", dest);
138-
139-
dest.println(" /** Looks up the tag of a structure URI based on the GEDCOM 7 spec");
140-
dest.println(" * @param uri the URI of the structure type");
141-
dest.println(" * @return the tag of the structure type, or <code>null</code> if unknown");
142-
dest.println(" */");
143-
dest.println(" public static String structTag(String uri) {");
144-
dest.println(" if (uri == null) return null;");
145-
dest.println(" int idx = binarySearch(tagKeys, uri);");
146-
dest.println(" if (idx < 0) return null;");
147-
dest.println(" return tagVals[idx];");
148-
dest.println(" }");
149-
150-
dest.println();
151-
152-
// payloads
153-
s = new Scanner(new URL("https://github.com/FamilySearch/GEDCOM/raw/main/extracted-files/payloads.tsv").openStream());
154-
known = readTSV(s);
155-
printStringArrayCode(known.keySet(), "payloadKeys", dest);
156-
printStringArrayCode(known.values(), "payloadVals", dest);
157-
158-
dest.println(" /** Looks up the payload type of a structure based on the GEDCOM 7 spec");
159-
dest.println(" * @param ctx the URI of the containing structure type");
160-
dest.println(" * @return the type code (URI or <code>\"Y|<NULL>\"</code> or <code>\"\"</code> or <code>\"@XREF:</code>tag<code>\"</code>) of the payload type, or <code>null</code> if unknown");
161-
dest.println(" */");
162-
dest.println(" public static String payloadURI(String ctx) {");
163-
dest.println(" if (ctx == null) return null;");
164-
dest.println(" int idx = binarySearch(payloadKeys, ctx);");
165-
dest.println(" if (idx < 0) return null;");
166-
dest.println(" return payloadVals[idx];");
167-
dest.println(" }");
168-
169-
dest.println();
170-
171-
// cardinalities
172-
s = new Scanner(new URL("https://github.com/FamilySearch/GEDCOM/raw/main/extracted-files/cardinalities.tsv").openStream());
173-
known = readTSV(s);
174-
TreeMap<String, TreeSet<String>> required = new TreeMap<String,TreeSet<String>>();
175-
TreeMap<String, Boolean> singular = new TreeMap<String,Boolean>();
176-
known.forEach((k,v) -> {
177-
if (v.charAt(1) == '1') {
178-
String[] k2 = k.split("\t");
179-
required.putIfAbsent(k2[0], new TreeSet<String>());
180-
required.get(k2[0]).add(k2[1]);
181-
}
182-
singular.put(k, v.charAt(3) == '1');
183-
});
184-
printStringArrayCode(required.keySet(), "reqKeys", dest);
185-
printStringArrayArrayCode(required.values(), "reqVals", dest);
186-
printStringArrayCode(singular.keySet(), "singleKeys", dest);
187-
printBooleanArrayCode(singular.values(), "singleVals", dest);
188-
189-
dest.println(" public static String[] requiredSubstructures(String struct) {");
190-
dest.println(" if (struct == null) return new String[0];");
191-
dest.println(" int idx = binarySearch(reqKeys, struct);");
192-
dest.println(" if (idx < 0) return new String[0];");
193-
dest.println(" return reqVals[idx];");
194-
dest.println(" }");
195-
196-
dest.println(" public static boolean justOne(String ctx, String uri) {");
197-
dest.println(" if (ctx == null) return false;");
198-
dest.println(" String key = ctx+'\\t'+uri;");
199-
dest.println(" int idx = binarySearch(singleKeys, key);");
200-
dest.println(" if (idx < 0) return false;");
201-
dest.println(" return singleVals[idx];");
202-
dest.println(" }");
203-
204-
205-
206-
dest.println();
207-
208-
209-
// FHISO's language mapping
210-
s = new Scanner(new URL("https://github.com/fhiso/legacy-format/raw/master/languages.tsv").openStream());
211-
known = readTSV(s);
212-
printStringArrayCode(known.keySet(), "langKeys", dest);
213-
printStringArrayCode(known.values(), "langVals", dest);
214-
dest.println(" /** Looks up the language tag type of a language based ELF's mapping");
215-
dest.println(" * @param lang the 5.5.1 language name");
216-
dest.println(" * @return the BCP-47 language tag, or <code>null</code> if unknown");
217-
dest.println(" */");
218-
dest.println(" public static String langTag(String ctx) {");
219-
dest.println(" if (ctx == null) return null;");
220-
dest.println(" int idx = binarySearch(langKeys, ctx, String.CASE_INSENSITIVE_ORDER);");
221-
dest.println(" if (idx < 0) return null;");
222-
dest.println(" return langVals[idx].replace(\"*\",\"\");");
223-
dest.println(" }");
224-
225-
dest.println("}");
226-
}
49+
public static void main(String[] args) {
50+
download(
51+
"https://github.com/FamilySearch/GEDCOM/raw/main/extracted-files/enumerations.tsv",
52+
"edu/virginia/ged5to7/config/enumerations.tsv"
53+
);
54+
download(
55+
"https://github.com/FamilySearch/GEDCOM/raw/main/extracted-files/payloads.tsv",
56+
"edu/virginia/ged5to7/config/payloads.tsv"
57+
);
58+
download(
59+
"https://github.com/FamilySearch/GEDCOM/raw/main/extracted-files/substructures.tsv",
60+
"edu/virginia/ged5to7/config/substructures.tsv"
61+
);
62+
download(
63+
"https://github.com/FamilySearch/GEDCOM/raw/main/extracted-files/cardinalities.tsv",
64+
"edu/virginia/ged5to7/config/cardinalities.tsv"
65+
);
66+
download(
67+
"https://github.com/fhiso/legacy-format/raw/master/languages.tsv",
68+
"edu/virginia/ged5to7/config/languages.tsv"
69+
);
70+
downloadIANALanguageSubtagRegistery(
71+
"edu/virginia/ged5to7/config/all-languages.tsv"
72+
);
227 73
}
228 74
}

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ The above will overwrite the file `edu/virginia/ged5to7/GedcomDefinitions.java`
42 42
- [x] change `OBJE` with no payload to pointer to new `OBJE` record
43 43
- [x] change `NOTE` record or with pointer payload into `SNOTE`
44 44
- [x] use heuristic to change some pointer-`NOTE` to nested-`NOTE` instead of `SNOTE`
45-
- [ ] Convert `LANG` payloads to BCP 47 tags, using [FHISO's mapping](https://github.com/fhiso/legacy-format/blob/master/languages.tsv)
45+
- [x] Convert `LANG` payloads to BCP 47 tags, using [FHISO's mapping](https://github.com/fhiso/legacy-format/blob/master/languages.tsv)
46 46
- [ ] Convert `MEDI`.`FORM` payloads to media types
47 47
- [ ] Enumerated values
48 48
- [ ] Normalize case

edu/virginia/ged5to7/GedStruct.java

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -99,15 +99,16 @@ static private String fixAtSign(String payload) {
99 99
}
100 100

101 101
public void tag2uri() {
102-
if (sup == null) uri = GedcomDefinitions.structURI("", tag);
103-
else if (sup.uri == null || GedcomDefinitions.structTag(sup.uri) == null)
104-
uri = GedcomDefinitions.structURI(null, tag);
105-
else uri = GedcomDefinitions.structURI(sup.uri, tag);
102+
GedcomDefinitions def = GedcomDefinitions.getDefinitions();
103+
if (sup == null) uri = def.structURI("", tag);
104+
else if (sup.uri == null || def.structTag(sup.uri) == null)
105+
uri = def.structURI(null, tag);
106+
else uri = def.structURI(sup.uri, tag);
106 107
for(GedStruct kid : sub) kid.tag2uri();
107 108
}
108 109
public void uri2tag() {
109 110
if (uri != null) {
110-
String tag2 = GedcomDefinitions.structTag(uri);
111+
String tag2 = GedcomDefinitions.getDefinitions().structTag(uri);
111 112
if (tag2 != null) tag = tag2;
112 113
}
113 114
for(GedStruct kid : sub) kid.uri2tag();

0 commit comments

Comments
 (0)