Skip to content

Commit 96deb13

Browse files
author
Luther Tychonievich
committed
refactor to not depend on folder of extracted TSV
1 parent 047529b commit 96deb13

File tree

5 files changed

+4149
-71
lines changed

5 files changed

+4149
-71
lines changed

DownloadDefinitions.java

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import java.net.URL;
2+
import java.util.Scanner;
3+
import java.util.ArrayList;
4+
import java.util.TreeMap;
5+
import java.io.IOException;
6+
import java.io.PrintWriter;
7+
import java.io.File;
8+
9+
/**
 * Build-time generator for {@code edu/virginia/ged5to7/GedcomDefinitions.java}.
 *
 * <p>Downloads the enumerations, substructures and payloads TSV files from the
 * FamilySearch GEDCOM repository and writes them out as sorted string arrays
 * together with binary-search lookup methods. The output path is relative, so
 * the program must be run from the directory that contains {@code edu/}.
 */
public class DownloadDefinitions {
    /** Location of the TSV files extracted from the GEDCOM 7 specification. */
    private static final String TSV_BASE =
        "https://github.com/FamilySearch/GEDCOM/raw/main/extracted-files/";

    /**
     * Regenerates {@code GedcomDefinitions.java} from the current online TSV files.
     *
     * @param args ignored
     * @throws IOException if a download fails or the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        // Download everything before touching the output file, so that a network
        // failure cannot leave a truncated GedcomDefinitions.java behind.
        TreeMap<String, String> enums = readTable(TSV_BASE + "enumerations.tsv");
        TreeMap<String, String> structs = readTable(TSV_BASE + "substructures.tsv");
        TreeMap<String, String> payloads = readTable(TSV_BASE + "payloads.tsv");

        // structure types -- derived from the substructures table:
        // maps each structure URI to its tag (the part of the key after the context URI)
        TreeMap<String, String> knownStructs = new TreeMap<String, String>();
        for (String key : structs.keySet()) {
            knownStructs.put(structs.get(key), key.substring(key.indexOf('\t') + 1));
        }

        try (PrintWriter dest = new PrintWriter(new File("edu/virginia/ged5to7/GedcomDefinitions.java"), "UTF-8")) {
            dest.println("/* WARNING: This file is automatically generated and should not be edited by hand */");
            dest.println("package edu.virginia.ged5to7;");
            dest.println("import static java.util.Arrays.binarySearch;");
            dest.println("\n/** A container for the substructure, payload, and enumeration rules from gedcom.io */");
            dest.println("public class GedcomDefinitions {");

            // enumerations
            emitArray(dest, "enumKeys", enums.keySet());
            emitArray(dest, "enumVals", enums.values());
            // enumVals is ordered by key, so it cannot be binary-searched by value;
            // emit a separately sorted, duplicate-free copy for the ctx == null lookup.
            emitArray(dest, "knownEnums", new java.util.TreeSet<String>(enums.values()));
            dest.println("    /** Looks up the URI of an enumeration based on the GEDCOM 7 spec");
            dest.println("     * @param ctx the URI of the containing structure.");
            dest.println("     *   use <code>null</code> for an extension.");
            dest.println("     * @param tag the enumeration value");
            dest.println("     * @return the URI of the enumeration value, or <code>null</code> if unknown");
            dest.println("     */");
            emitContextLookup(dest, "enumURI", "enumKeys", "enumVals", "knownEnums");

            dest.println();

            // substructures
            emitArray(dest, "structKeys", structs.keySet());
            emitArray(dest, "structVals", structs.values());
            dest.println("    /** Looks up the URI of a structure type based on the GEDCOM 7 spec");
            dest.println("     * @param ctx the URI of the containing structure type");
            dest.println("     *   use <code>\"\"</code> for a record and <code>null</code> for an extension.");
            dest.println("     * @param tag the tag of the structure");
            dest.println("     * @return the URI of the structure type, or <code>null</code> if unknown");
            dest.println("     */");
            // knownStructs (emitted below) is the sorted set of structure URIs,
            // which is what the ctx == null branch has to search.
            emitContextLookup(dest, "structURI", "structKeys", "structVals", "knownStructs");

            // structure types -- uses same file as substructures above
            emitArray(dest, "knownStructs", knownStructs.keySet());
            emitArray(dest, "uriTag", knownStructs.values());
            dest.println("    /** Looks up the tag of a structure URI based on the GEDCOM 7 spec");
            dest.println("     * @param uri the URI of the structure type");
            dest.println("     * @return the tag of the structure type, or <code>null</code> if unknown");
            dest.println("     */");
            emitKeyLookup(dest, "structTag", "uri", "knownStructs", "uriTag");

            dest.println();

            // payloads
            emitArray(dest, "payloadKeys", payloads.keySet());
            emitArray(dest, "payloadVals", payloads.values());
            dest.println("    /** Looks up the payload type of a structure based on the GEDCOM 7 spec");
            dest.println("     * @param ctx the URI of the containing structure type");
            dest.println("     * @return the type code (URI or <code>\"Y|<NULL>\"</code> or <code>\"\"</code> or <code>\"@XREF:</code>tag<code>\"</code>) of the payload type, or <code>null</code> if unknown");
            dest.println("     */");
            emitKeyLookup(dest, "payloadURI", "ctx", "payloadKeys", "payloadVals");

            dest.println("}");

            // PrintWriter never throws on write errors; surface them explicitly.
            if (dest.checkError()) {
                throw new IOException("error while writing edu/virginia/ged5to7/GedcomDefinitions.java");
            }
        }
    }

    /**
     * Downloads one TSV file and splits every row at its last tab.
     *
     * @param address URL of the TSV file
     * @return map from the text before the last tab to the text after it, sorted
     *         in natural String order (the order binarySearch requires);
     *         rows without a tab are skipped
     * @throws IOException if the download fails part-way or cannot be started
     */
    private static TreeMap<String, String> readTable(String address) throws IOException {
        TreeMap<String, String> table = new TreeMap<String, String>();
        try (Scanner s = new Scanner(new URL(address).openStream(), "UTF-8")) {
            while (s.hasNextLine()) {
                String line = s.nextLine();
                int split = line.lastIndexOf('\t');
                if (split < 0) continue; // blank or malformed row: nothing to index
                table.put(line.substring(0, split), line.substring(split + 1));
            }
            // Scanner hides read errors and just reports end of input; check for them.
            IOException failure = s.ioException();
            if (failure != null) throw failure;
        }
        return table;
    }

    /** Writes {@code items} as a private static final String[] named {@code name}. */
    private static void emitArray(PrintWriter dest, String name, Iterable<String> items) {
        dest.println("    private static final String[] " + name + " =");
        char before = '{';
        for (String item : items) {
            dest.println("        " + before + quote(item));
            before = ',';
        }
        if (before == '{') dest.println("        {"); // no items: still open the initializer
        dest.println("        };");
    }

    /** Writes a (ctx, tag) lookup method; a null ctx searches {@code uris} for the standard URI of the tag. */
    private static void emitContextLookup(PrintWriter dest, String method, String keys, String vals, String uris) {
        dest.println("    public static String " + method + "(String ctx, String tag) {");
        dest.println("        if (ctx == null) {");
        dest.println("            String val = \"https://gedcom.io/terms/v7/\"+tag;");
        dest.println("            int idx = binarySearch(" + uris + ", val);");
        dest.println("            if (idx < 0) return null;");
        dest.println("            return " + uris + "[idx];");
        dest.println("        } else {");
        dest.println("            String key = ctx+'\\t'+tag;");
        dest.println("            int idx = binarySearch(" + keys + ", key);");
        dest.println("            if (idx < 0) return null;");
        dest.println("            return " + vals + "[idx];");
        dest.println("        }");
        dest.println("    }");
    }

    /** Writes a single-argument lookup method that maps an entry of {@code keys} to the matching entry of {@code vals}. */
    private static void emitKeyLookup(PrintWriter dest, String method, String param, String keys, String vals) {
        dest.println("    public static String " + method + "(String " + param + ") {");
        dest.println("        if (" + param + " == null) return null;");
        dest.println("        int idx = binarySearch(" + keys + ", " + param + ");");
        dest.println("        if (idx < 0) return null;");
        dest.println("        return " + vals + "[idx];");
        dest.println("    }");
    }

    /** Renders {@code text} as an ASCII-only Java string literal, escaping anything that could break it. */
    private static String quote(String text) {
        StringBuilder sb = new StringBuilder(text.length() + 2).append('"');
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '\\': sb.append("\\\\"); break;
                case '"':  sb.append("\\\""); break;
                case '\t': sb.append("\\t");  break;
                default:
                    if (c < 0x20 || c > 0x7e) sb.append(String.format("\\u%04x", (int) c));
                    else sb.append(c);
            }
        }
        return sb.append('"').toString();
    }
}

README.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,22 @@
1+
# Incomplete Draft
2+
13
This is an **incomplete, work-in-progress** 5.5.1-to-7.0 converter.
24
Some parts are ported directly from the C converter (such as the ANSEL Charset and date and age parsing) while others are built from the ground up. The hope is that having two somewhat-separate implementations will allow me to use the two to test one another, a hope that has already resulted in a few bug fixes in the C version.
35

4-
Current status:
6+
# Updating to new versions of GEDCOM
7+
8+
The file `edu/virginia/ged5to7/GedcomDefinitions.java` contains preprocessed copies of the TSV files from <https://github.com/FamilySearch/GEDCOM/tree/main/extracted-files>. When a new (minor or major) version of the spec is released, updates to those files will need to be incorporated by running
9+
10+
```bash
11+
javac DownloadDefinitions.java
12+
java DownloadDefinitions
13+
```
14+
15+
The above will overwrite the file `edu/virginia/ged5to7/GedcomDefinitions.java` with an updated version.
16+
17+
`DownloadDefinitions.java` is otherwise unneeded, and should not be included in distributions of the ged5to7 package.
18+
19+
# Current status
520

621
- [x] Detect character encodings, as documented in [ELF Serialisation](https://fhiso.org/TR/elf-serialisation).
722
- [x] Convert to UTF-8

edu/virginia/ged5to7/Converter5to7.java

Lines changed: 0 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -19,64 +19,9 @@ public class Converter5to7 {
1919
private final int ID_TO_SKIP;
2020
private LinkedList<GedStruct> records;
2121
private List<String> log;
22-
private static String TSV_DIR = "../../GEDCOM/extracted-files";
23-
2422

2523
final static java.nio.charset.Charset UTF8 = java.nio.charset.Charset.forName("UTF-8");
2624

27-
28-
static TwoKeyMap<String> substructures; // .get(superstructure URI, tag) -> URI
29-
static Map<String, String> payloads; // .get(URI) -> payload type
30-
static TwoKeyMap<String> enumerations; // .get(structure URI, payload) -> URI
31-
static TwoKeyMap<String> uri2tag; // .get(container URI, URI) -> tag/payload
32-
33-
private static boolean setExtractedFilesDirectory(String path) {
34-
if (!Files.isDirectory(Paths.get(path))) return false;
35-
boolean something = false;
36-
if (Files.exists(Paths.get(path, "substructures.tsv"))) {
37-
if (substructures == null) substructures = new TwoKeyMap<String>();
38-
if (uri2tag == null) uri2tag = new TwoKeyMap<String>();
39-
try {
40-
Files.lines(Paths.get(path, "substructures.tsv")).forEach(line -> {
41-
String[] bits = line.split("\t");
42-
if (bits.length == 3) {
43-
substructures.put(bits[0], bits[1], bits[2]);
44-
uri2tag.put(bits[0], bits[2], bits[1]);
45-
}
46-
});
47-
something = true;
48-
} catch (IOException ex) { System.err.println(ex.toString()); }
49-
}
50-
if (Files.exists(Paths.get(path, "enumerations.tsv"))) {
51-
if (enumerations == null) enumerations = new TwoKeyMap<String>();
52-
if (uri2tag == null) uri2tag = new TwoKeyMap<String>();
53-
try {
54-
Files.lines(Paths.get(path, "enumerations.tsv")).forEach(line -> {
55-
String[] bits = line.split("\t");
56-
if (bits.length == 3) {
57-
enumerations.put(bits[0], bits[1], bits[2]);
58-
uri2tag.put(bits[0], bits[2], bits[1]);
59-
}
60-
});
61-
something = true;
62-
} catch (IOException ex) { System.err.println(ex.toString()); }
63-
}
64-
if (Files.exists(Paths.get(path, "payloads.tsv"))) {
65-
if (payloads == null) payloads = new TreeMap<String,String>();
66-
try {
67-
Files.lines(Paths.get(path, "payloads.tsv")).forEach(line -> {
68-
String[] bits = line.split("\t");
69-
if (bits.length == 2) {
70-
payloads.put(bits[0], bits[1]);
71-
}
72-
});
73-
something = true;
74-
} catch (IOException ex) { System.err.println(ex.toString()); }
75-
}
76-
return something;
77-
}
78-
79-
8025
/**
8126
* Parses file using error-tolerant algorithm and performs full 5to7 conversion.
8227
*/
@@ -191,16 +136,6 @@ private void reID() {
191136
public static void main(String[] args) {
192137
System.err.println();
193138
for(String path : args) {
194-
System.err.println("path "+path);
195-
if (Files.isDirectory(Paths.get(path))) {
196-
if (!setExtractedFilesDirectory(path)) {
197-
System.err.println(path+" is a directory");
198-
}
199-
System.err.println("Parsed " + substructures.size() + " substructure rules");
200-
System.err.println("Parsed " + enumerations.size() + " enumeration rules");
201-
System.err.println("Parsed " + payloads.size() + " payload type rules");
202-
continue;
203-
}
204139
System.err.println("\nProcessing "+path+" ...");
205140
Converter5to7 conv = new Converter5to7(path);
206141
try { conv.dumpTo(System.out); } catch (IOException ex) { ex.printStackTrace(); }

edu/virginia/ged5to7/GedStruct.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -99,16 +99,16 @@ static private String fixAtSign(String payload) {
9999
}
100100

101101
public void tag2uri() {
102-
if (Converter5to7.substructures == null) return;
103102
if (sup == null && tag.equals("HEAD")) uri = "HEAD pseudostructure";
104-
else if (sup == null) uri = Converter5to7.substructures.get("", tag);
105-
else if (sup.uri != null) uri = Converter5to7.substructures.get(sup.uri, tag);
103+
else if (sup == null) uri = GedcomDefinitions.structURI("", tag);
104+
else if (sup.uri == null || GedcomDefinitions.structTag(sup.uri) == null)
105+
uri = GedcomDefinitions.structURI(null, tag);
106+
else uri = GedcomDefinitions.structURI(sup.uri, tag);
106107
for(GedStruct kid : sub) kid.tag2uri();
107108
}
108109
public void uri2tag() {
109-
if (Converter5to7.uri2tag == null) return;
110110
if (uri != null) {
111-
String tag2 = Converter5to7.uri2tag.get((sup == null || sup.uri == null) ? "" : sup.uri, uri);
111+
String tag2 = GedcomDefinitions.structTag(uri);
112112
if (tag2 != null) tag = tag2;
113113
}
114114
for(GedStruct kid : sub) kid.uri2tag();

0 commit comments

Comments
 (0)