Skip to content

Commit acb317d

Browse files
committed
BAEL-3480 Java Fast pattern matching using trie and suffix tree
1 parent 0e91643 commit acb317d

3 files changed

Lines changed: 313 additions & 0 deletions

File tree

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
package com.baeldung.algorithms.suffixtree;
2+
3+
import java.util.ArrayList;
4+
import java.util.List;
5+
6+
public class Node {
7+
private String text;
8+
private List<Node> children;
9+
private int position;
10+
11+
public Node(String word, int position) {
12+
this.text = word;
13+
this.position = position;
14+
this.children = new ArrayList<>();
15+
}
16+
17+
public String getText() {
18+
return text;
19+
}
20+
21+
public void setText(String text) {
22+
this.text = text;
23+
}
24+
25+
public int getPosition() {
26+
return position;
27+
}
28+
29+
public void setPosition(int position) {
30+
this.position = position;
31+
}
32+
33+
public List<Node> getChildren() {
34+
return children;
35+
}
36+
37+
public void setChildren(List<Node> children) {
38+
this.children = children;
39+
}
40+
41+
public String printTree(String depthIndicator) {
42+
String str = "";
43+
String positionStr = position > -1 ? "[" + String.valueOf(position) + "]" : "";
44+
str += depthIndicator + text + positionStr + "\n";
45+
46+
for (int i = 0; i < children.size(); i++) {
47+
str += children.get(i)
48+
.printTree(depthIndicator + "\t");
49+
}
50+
return str;
51+
}
52+
53+
@Override
54+
public String toString() {
55+
return printTree("");
56+
}
57+
}
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
package com.baeldung.algorithms.suffixtree;
2+
3+
import java.util.ArrayList;
4+
import java.util.List;
5+
import java.util.stream.Collectors;
6+
7+
import org.slf4j.Logger;
8+
import org.slf4j.LoggerFactory;
9+
10+
public class SuffixTree {
11+
private static final Logger LOGGER = LoggerFactory.getLogger(SuffixTree.class);
12+
private static final String WORD_TERMINATION = "$";
13+
private static final int POSITION_UNDEFINED = -1;
14+
private Node root;
15+
private String fullText;
16+
17+
public SuffixTree() {
18+
root = new Node("", POSITION_UNDEFINED);
19+
fullText = "";
20+
}
21+
22+
public void addText(String text) {
23+
for (int i = 0; i < text.length(); i++) {
24+
addSuffix(text.substring(i) + WORD_TERMINATION, i);
25+
}
26+
fullText += text;
27+
}
28+
29+
public List<String> searchText(String pattern) {
30+
LOGGER.info("Searching for pattern \"{}\"", pattern);
31+
List<String> result = new ArrayList<>();
32+
List<Node> nodes = getAllNodesInTraversePath(pattern, root, false);
33+
34+
if (nodes.size() > 0) {
35+
Node lastNode = nodes.get(nodes.size() - 1);
36+
if (lastNode != null) {
37+
List<Integer> positions = getPositions(lastNode);
38+
positions = positions.stream()
39+
.sorted()
40+
.collect(Collectors.toList());
41+
positions.forEach(m -> result.add((markPatternInText(m, pattern))));
42+
}
43+
}
44+
return result;
45+
}
46+
47+
private void addSuffix(String suffix, int position) {
48+
LOGGER.info(">>>>>>>>>>>> Adding new suffix {}", suffix);
49+
List<Node> nodes = getAllNodesInTraversePath(suffix, root, true);
50+
if (nodes.size() == 0) {
51+
addChildNode(root, suffix, position);
52+
LOGGER.info("{}", printTree());
53+
} else {
54+
Node lastNode = nodes.remove(nodes.size() - 1);
55+
String newText = suffix;
56+
if (nodes.size() > 0) {
57+
String existingSuffixUptoLastNode = nodes.stream()
58+
.map(a -> a.getText())
59+
.reduce("", String::concat);
60+
61+
// Remove prefix from newText already included in parent
62+
newText = newText.substring(existingSuffixUptoLastNode.length());
63+
}
64+
extendNode(lastNode, newText, position);
65+
LOGGER.info("{}", printTree());
66+
}
67+
}
68+
69+
private List<Integer> getPositions(Node node) {
70+
List<Integer> positions = new ArrayList<>();
71+
if (node.getText()
72+
.endsWith(WORD_TERMINATION)) {
73+
positions.add(node.getPosition());
74+
}
75+
for (int i = 0; i < node.getChildren()
76+
.size(); i++) {
77+
positions.addAll(getPositions(node.getChildren()
78+
.get(i)));
79+
}
80+
return positions;
81+
}
82+
83+
private String markPatternInText(Integer startPosition, String pattern) {
84+
String matchingTextLHS = fullText.substring(0, startPosition);
85+
String matchingText = fullText.substring(startPosition, startPosition + pattern.length());
86+
String matchingTextRHS = fullText.substring(startPosition + pattern.length());
87+
return matchingTextLHS + "[" + matchingText + "]" + matchingTextRHS;
88+
}
89+
90+
private void addChildNode(Node parentNode, String text, int position) {
91+
parentNode.getChildren()
92+
.add(new Node(text, position));
93+
}
94+
95+
private void extendNode(Node node, String newText, int position) {
96+
String currentText = node.getText();
97+
String commonPrefix = getLongestCommonPrefix(currentText, newText);
98+
99+
if (commonPrefix != currentText) {
100+
String parentText = currentText.substring(0, commonPrefix.length());
101+
String childText = currentText.substring(commonPrefix.length());
102+
splitNodeToParentAndChild(node, parentText, childText);
103+
}
104+
105+
String remainingText = newText.substring(commonPrefix.length());
106+
addChildNode(node, remainingText, position);
107+
}
108+
109+
private void splitNodeToParentAndChild(Node parentNode, String parentNewText, String childNewText) {
110+
Node childNode = new Node(childNewText, parentNode.getPosition());
111+
112+
if (parentNode.getChildren()
113+
.size() > 0) {
114+
while (parentNode.getChildren()
115+
.size() > 0) {
116+
childNode.getChildren()
117+
.add(parentNode.getChildren()
118+
.remove(0));
119+
}
120+
}
121+
122+
parentNode.getChildren()
123+
.add(childNode);
124+
parentNode.setText(parentNewText);
125+
parentNode.setPosition(POSITION_UNDEFINED);
126+
}
127+
128+
private String getLongestCommonPrefix(String str1, String str2) {
129+
int compareLength = Math.min(str1.length(), str2.length());
130+
for (int i = 0; i < compareLength; i++) {
131+
if (str1.charAt(i) != str2.charAt(i)) {
132+
return str1.substring(0, i);
133+
}
134+
}
135+
return str1.substring(0, compareLength);
136+
}
137+
138+
private List<Node> getAllNodesInTraversePath(String pattern, Node startNode, boolean isAllowPartialMatch) {
139+
List<Node> nodes = new ArrayList<>();
140+
for (int i = 0; i < startNode.getChildren()
141+
.size(); i++) {
142+
Node currentNode = startNode.getChildren()
143+
.get(i);
144+
String nodeText = currentNode.getText();
145+
if (pattern.charAt(0) == nodeText.charAt(0)) {
146+
if (isAllowPartialMatch && pattern.length() <= nodeText.length()) {
147+
nodes.add(currentNode);
148+
return nodes;
149+
}
150+
151+
int compareLength = Math.min(nodeText.length(), pattern.length());
152+
for (int j = 1; j < compareLength; j++) {
153+
if (pattern.charAt(j) != nodeText.charAt(j)) {
154+
if (isAllowPartialMatch)
155+
nodes.add(currentNode);
156+
return nodes;
157+
}
158+
}
159+
160+
nodes.add(currentNode);
161+
if (pattern.length() > compareLength) {
162+
List<Node> nodes2 = getAllNodesInTraversePath(pattern.substring(compareLength), currentNode, isAllowPartialMatch);
163+
if (nodes2.size() == 0 && !isAllowPartialMatch) {
164+
nodes.add(null);
165+
return nodes;
166+
}
167+
nodes.addAll(nodes2);
168+
}
169+
return nodes;
170+
}
171+
}
172+
return nodes;
173+
}
174+
175+
public String printTree() {
176+
return root.printTree("");
177+
}
178+
}
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
package com.baeldung.algorithms.suffixtree;
2+
3+
import java.util.List;
4+
5+
import org.junit.Assert;
6+
import org.junit.BeforeClass;
7+
import org.junit.Test;
8+
import org.slf4j.Logger;
9+
import org.slf4j.LoggerFactory;
10+
11+
public class SuffixTreeUnitTest {
12+
13+
private static final Logger LOGGER = LoggerFactory.getLogger(SuffixTreeUnitTest.class);
14+
15+
private static SuffixTree suffixTree;
16+
17+
@BeforeClass
18+
public static void setUp() {
19+
suffixTree = new SuffixTree();
20+
suffixTree.addText("bananabanana");
21+
printTree();
22+
}
23+
24+
@Test
25+
public void givenSuffixTree_whenSearchingForA_thenReturn6Matches() {
26+
List<String> matches = suffixTree.searchText("a");
27+
matches.stream()
28+
.forEach(m -> LOGGER.info(m));
29+
Assert.assertArrayEquals(new String[] { "b[a]nanabanana", "ban[a]nabanana", "banan[a]banana", "bananab[a]nana", "bananaban[a]na", "bananabanan[a]" }, matches.toArray());
30+
}
31+
32+
@Test
33+
public void givenSuffixTree_whenSearchingForNab_thenReturn1Match() {
34+
List<String> matches = suffixTree.searchText("nab");
35+
matches.stream()
36+
.forEach(m -> LOGGER.info(m));
37+
Assert.assertArrayEquals(new String[] { "bana[nab]anana" }, matches.toArray());
38+
}
39+
40+
@Test
41+
public void givenSuffixTree_whenSearchingForNag_thenReturnNoMatches() {
42+
List<String> matches = suffixTree.searchText("nag");
43+
matches.stream()
44+
.forEach(m -> LOGGER.info(m));
45+
Assert.assertArrayEquals(new String[] {}, matches.toArray());
46+
}
47+
48+
@Test
49+
public void givenSuffixTree_whenSearchingForBanana_thenReturn2Matches() {
50+
List<String> matches = suffixTree.searchText("banana");
51+
matches.stream()
52+
.forEach(m -> LOGGER.info(m));
53+
Assert.assertArrayEquals(new String[] { "[banana]banana", "banana[banana]" }, matches.toArray());
54+
}
55+
56+
@Test
57+
public void givenSuffixTree_whenSearchingForNa_thenReturn4Matches() {
58+
List<String> matches = suffixTree.searchText("na");
59+
matches.stream()
60+
.forEach(m -> LOGGER.info(m));
61+
Assert.assertArrayEquals(new String[] { "ba[na]nabanana", "bana[na]banana", "bananaba[na]na", "bananabana[na]" }, matches.toArray());
62+
}
63+
64+
@Test
65+
public void givenSuffixTree_whenSearchingForX_thenReturnNoMatches() {
66+
List<String> matches = suffixTree.searchText("x");
67+
matches.stream()
68+
.forEach(m -> LOGGER.info(m));
69+
Assert.assertArrayEquals(new String[] {}, matches.toArray());
70+
}
71+
72+
private static void printTree() {
73+
suffixTree.printTree();
74+
75+
LOGGER.info("\n" + suffixTree.printTree());
76+
LOGGER.info("==============================================");
77+
}
78+
}

0 commit comments

Comments
 (0)