Skip to content

Commit 664e3ac

Browse files
Edan Toledo
authored and committed
Fix tokenization error for AMR graphs and add extra information to tokens where available
1 parent 450dd9f commit 664e3ac

4 files changed

Lines changed: 72 additions & 19 deletions

File tree

src/main/java/com/RepGraph/AMRGraph.java

Lines changed: 31 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -42,28 +42,13 @@ public AMRGraph(@JsonProperty("id") String id, @JsonProperty("source") String so
4242

4343
alignNodes();
4444

45-
populateTokens();
4645

4746

4847

49-
}
50-
51-
@Override
52-
public ArrayList<Token> extractTokensFromNodes() {
53-
ArrayList<Token> tokenlist = new ArrayList<>();
54-
55-
int index = 0;
5648

57-
PTBTokenizer<CoreLabel> ptbt = PTBTokenizer.newPTBTokenizer(new StringReader(this.input), false, true);
58-
while (ptbt.hasNext()) {
59-
CoreLabel label = ptbt.next();
60-
tokenlist.add(new Token(index, label.originalText(), label.word(), label.word()));
61-
index++;
62-
}
49+
}
6350

64-
return tokenlist;
6551

66-
}
6752

6853
public void alignUtil(String NodeID, HashMap<String, Node> nodes, HashMap<String, Boolean> visited, int layer, FileWriter writer) throws IOException {
6954
// Mark the current node as visited and print it
@@ -95,6 +80,7 @@ public void alignNodes() throws IOException, InterruptedException {
9580
for (String i : nodes.keySet()) {
9681
visited.put(i, false);
9782
}
83+
9884
FileWriter myWriter = new FileWriter("supportScripts/temp/AMR_TEMP.txt");
9985
myWriter.write("#::snt " + this.input + "\n");
10086
myWriter.write("(v" + this.top + " / " + nodes.get(this.top).getLabel() + " ");
@@ -110,6 +96,11 @@ public void alignNodes() throws IOException, InterruptedException {
11096
new BufferedReader(new InputStreamReader(proc.getInputStream()));
11197

11298
String line = null;
99+
String[] tokens = null;
100+
String[] ner_tags = null;
101+
String[] ner_iob_tags = null;
102+
String[] pos_tags = null;
103+
String[] lemmas = null;
113104
while ((line = input.readLine()) != null) {
114105

115106
if (line.startsWith("v")) {
@@ -126,8 +117,32 @@ public void alignNodes() throws IOException, InterruptedException {
126117
nodes.get(nodeid).getAnchors().add(a);
127118

128119
}
120+
if (line.startsWith("###tokens")){
121+
tokens = input.readLine().split("<###>");
122+
}
123+
if (line.startsWith("###ner_tags")){
124+
ner_tags = input.readLine().split("<###>");
125+
}
126+
if (line.startsWith("###ner_iob_tags")){
127+
ner_iob_tags = input.readLine().split("<###>");
128+
}
129+
if (line.startsWith("###pos_tags")){
130+
pos_tags = input.readLine().split("<###>");
131+
}
132+
if (line.startsWith("###lemmas")){
133+
lemmas = input.readLine().split("<###>");
134+
}
129135
}
136+
ArrayList<Token> tokenlist = new ArrayList<>();
137+
for (int i = 0; i < tokens.length ; i++) {
130138

139+
tokenlist.add(new Token(i,tokens[i],lemmas[i],null));
140+
tokenlist.get(i).getExtraInformation().put("NER",ner_tags[i]);
141+
tokenlist.get(i).getExtraInformation().put("NER_IOB",ner_iob_tags[i]);
142+
tokenlist.get(i).getExtraInformation().put("POS",pos_tags[i]);
143+
144+
}
145+
setTokens(tokenlist);
131146

132147
}
133148

src/main/java/com/RepGraph/AbstractGraph.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ public void setTokens(ArrayList<Token> tokens) {
167167

168168
public ArrayList<Token> extractTokensFromNodes() {
169169
ArrayList<Token> tokenlist = new ArrayList<>();
170-
ArrayList<String> list = new ArrayList<>();
170+
171171
int index =0;
172172

173173
PTBTokenizer<CoreLabel> ptbt = PTBTokenizer.newPTBTokenizer(new StringReader(this.input),false,true);

src/main/java/com/RepGraph/Token.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
99

10+
import java.util.HashMap;
11+
1012
@JsonIgnoreProperties(value = {"Anchors"})
1113
public class Token {
1214

@@ -30,6 +32,7 @@ public class Token {
3032
*/
3133
private String carg;
3234

35+
private HashMap<String,String> extraInformation = new HashMap<>();
3336
/**
3437
* Default constructor of the Token class.
3538
*/
@@ -50,6 +53,14 @@ public Token(int index, String form, String lemma, String carg) {
5053
this.carg = carg;
5154
}
5255

56+
public HashMap<String, String> getExtraInformation() {
57+
return extraInformation;
58+
}
59+
60+
public void setExtraInformation(HashMap<String, String> extraInformation) {
61+
this.extraInformation = extraInformation;
62+
}
63+
5364
/**
5465
* Getter method for the Token's index.
5566
* @return Integer The Token's index.

supportScripts/align_AMR.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import amrlib
22
from amrlib.alignments.rbw_aligner import RBWAligner
3-
from amrlib.graph_processing.annotator import add_lemmas
3+
from amrlib.graph_processing.annotator import add_lemmas,annotate_graph
44
import penman
55
from penman.surface import Alignment
66
import os
@@ -9,11 +9,38 @@
99
dirname = os.path.dirname(__file__)
1010
filename = os.path.join(dirname, os.path.join('temp',"AMR_TEMP.txt"))
1111
f = open(filename, "r")
12+
graph_string = f.read()
13+
14+
penman_graph = annotate_graph(graph_string)
15+
tokens = eval(penman_graph.metadata["tokens"])
16+
ner_tags = eval(penman_graph.metadata["ner_tags"])
17+
ner_iob = eval(penman_graph.metadata["ner_iob"])
18+
pos_tags = eval(penman_graph.metadata["pos_tags"])
19+
lemmas = eval(penman_graph.metadata["lemmas"])
1220

13-
penman_graph = add_lemmas(f.read(),snt_key="snt")
1421
aligner = RBWAligner.from_penman_w_json(penman_graph)
1522
aligned_graph = aligner.get_penman_graph()
1623
alignments = penman.surface._get_alignments(aligned_graph, Alignment)
1724
for key in alignments:
1825
print(key[0],alignments[key].indices)
1926

27+
print("###tokens")
28+
for token in tokens:
29+
print(token,end="<###>")
30+
31+
print("\n###ner_tags")
32+
for ner in ner_tags:
33+
print(ner,end="<###>")
34+
35+
print("\n###ner_iob_tags")
36+
for ner_iob_tag in ner_iob:
37+
print(ner_iob_tag,end="<###>")
38+
39+
print("\n###pos_tags")
40+
for pos in pos_tags:
41+
print(pos,end="<###>")
42+
43+
print("\n###lemmas")
44+
for lemma in lemmas:
45+
print(lemma,end="<###>")
46+

0 commit comments

Comments
 (0)