Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ Regex lower = all.diff(upper);
System.out.println(lower.matches("aaa")); // true
System.out.println(lower.matches("Aaa")); // false

System.out.println(lower.matches(new ByteArrayInputStream("aaa".getBytes(StandardCharsets.UTF_8)))); // true
System.out.println(lower.matches(new ByteArrayInputStream("Aaa".getBytes(StandardCharsets.UTF_8)))); // false

```

The motivating use case was detecting non-intersecting expressions. Once it can be established that a set of expressions do not intersect (that they are disjoint) it becomes possible to short-circuit evaluations. Moreover, they can be tested in any order, allowing for reordering based on matching statistics. This is especially important in performance-critical paths where multiple expressions are matched, such as in load balancers.
Expand Down
34 changes: 34 additions & 0 deletions src/main/java/dregex/Regex.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package dregex;

import dregex.impl.*;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -42,6 +45,20 @@ public boolean matches(CharSequence input) {
return matchAndReport(input).matches();
}

/**
 * Return whether the contents of the input stream are matched by the regular expression (i.e. whether the text
 * read from the stream is included in the language generated by the expression). As the match is done using a
 * DFA, its complexity is O(n), where n is the length of the stream. It is constant with respect to the length of
 * the expression.
 * <p>
 * This is a convenience wrapper around {@link #matchAndReport(InputStream)} that keeps only the boolean outcome.
 * The DFA matcher visible alongside this class decodes the bytes as UTF-8 and closes the stream when it is done;
 * callers should not expect to reuse the stream afterwards.
 *
 * @param input the stream to match
 *
 * @return whether the input matches the regex
 * @throws IOException if an I/O error occurs while reading the stream
 */
public boolean matches(InputStream input) throws IOException {
    MatchResult outcome = matchAndReport(input);
    return outcome.matches();
}

/**
* Return whether the input string is matched by the regular expression (i.e. whether the string is included in the
* language generated by the expression). As the match is done using a DFA, its complexity is O(n), where n is the
Expand All @@ -58,6 +75,23 @@ public MatchResult matchAndReport(CharSequence input) {
return regexImpl.matchAndReport(input);
}

/**
 * Match the contents of the input stream against the regular expression and report the outcome. As the match is
 * done using a DFA, its complexity is O(n), where n is the length of the stream. It is constant with respect to
 * the length of the expression.
 * <p>
 * This method is similar to {@link #matches(InputStream)}, except that it also returns how many characters were
 * successfully matched in case of failure.
 *
 * @param input the stream to match
 *
 * @return an object with information about the matching attempt
 * @throws IOException if an I/O error occurs while reading the stream
 */
public MatchResult matchAndReport(InputStream input) throws IOException {
    MatchResult report = regexImpl.matchAndReport(input);
    return report;
}

/**
* Intersect this regular expression with another. The resulting expression will match the strings that are
* matched by the operands, and only those. Intersections take O(n⋅m) time, where n and m are the number of states of
Expand Down
63 changes: 63 additions & 0 deletions src/main/java/dregex/impl/DfaAlgorithms.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
package dregex.impl;

import dregex.MatchResult;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.function.BiPredicate;
import java.util.function.Function;
Expand Down Expand Up @@ -208,6 +213,64 @@ public static MatchResult matchString(Dfa dfa, CharSequence string) {
return new MatchResult(dfa.accepting.contains(current), i);
}

/**
 * Runs the DFA over the contents of a byte stream, decoding the bytes as UTF-8.
 * <p>
 * The input is consumed one Unicode code point at a time and every code point drives exactly one transition.
 * Matching stops early, with a failed result, as soon as a code point has no outgoing transition from the
 * current state or a surrogate pair cannot be completed. Otherwise the result reflects whether the state reached
 * at end of input is accepting.
 * <p>
 * The reader wrapping {@code inputStream} is managed by try-with-resources, so the supplied stream is closed
 * before this method returns, on both the success and the failure path.
 *
 * @param dfa the automaton to run
 * @param inputStream the UTF-8 encoded input; it is closed by this method
 * @return the outcome of the match and the number of code points consumed before the end of input or the failure
 * @throws IOException if reading from the stream fails
 */
public static MatchResult matchInputStream(Dfa dfa, InputStream inputStream) throws IOException {
    // Start from the initial state of the DFA
    State currentState = dfa.initial;

    // Number of code points successfully consumed so far
    int position = 0;

    try (InputStreamReader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) {
        int ch;
        while ((ch = reader.read()) != -1) {
            // Combine surrogate pairs so that the DFA always sees whole code points
            int codePoint = readCodePoint(reader, ch);
            if (codePoint == -1) {
                // The surrogate pair could not be completed, so the input is malformed rather than finished.
                // Falling through to the accepting-state check would let a broken stream count as a match.
                return new MatchResult(false, position);
            }

            State nextState = getNextState(dfa, currentState, codePoint);
            if (nextState == null) {
                // No transition for this code point: report failure at the current position
                return new MatchResult(false, position);
            }

            currentState = nextState;
            position++;
        }
    }

    // The whole stream was consumed; it matches only if we ended in an accepting state
    return new MatchResult(dfa.accepting.contains(currentState), position);
}

/**
 * Completes the Unicode code point that begins with {@code firstChar}, reading one more UTF-16 unit from the
 * reader when {@code firstChar} is a high surrogate.
 * <p>
 * A value that is not a high surrogate is returned unchanged (this includes an unpaired low surrogate). For a
 * high surrogate, the following unit must exist and be a low surrogate; otherwise the pair is invalid and
 * {@code -1} is returned instead of combining unrelated units into a meaningless code point. In the invalid case
 * the unit that was read is consumed and not made available again.
 *
 * @param reader the reader to pull the second half of a surrogate pair from
 * @param firstChar the UTF-16 unit already read by the caller (must not be {@code -1})
 * @return the code point, or {@code -1} if a surrogate pair was truncated or malformed
 * @throws IOException if reading the second unit fails
 */
private static int readCodePoint(InputStreamReader reader, int firstChar) throws IOException {
    char c1 = (char) firstChar;
    if (!Character.isHighSurrogate(c1)) {
        return c1;
    }

    int ch2 = reader.read();
    // End of stream, or a unit that cannot legally follow a high surrogate
    if (ch2 == -1 || !Character.isLowSurrogate((char) ch2)) {
        return -1;
    }
    return Character.toCodePoint(c1, (char) ch2);
}

/**
 * Looks up the state reached from {@code current} when the DFA consumes {@code codePoint}.
 * <p>
 * The outgoing transitions of a state are keyed by character interval. The candidate is the greatest key not
 * above the single-point interval {@code [codePoint, codePoint]}; it is taken only if its upper bound still
 * covers the code point.
 *
 * @param dfa the automaton whose transition table is consulted
 * @param current the state to transition from
 * @param codePoint the input code point
 * @return the target state, or {@code null} if {@code current} has no transition for {@code codePoint}
 */
private static State getNextState(Dfa dfa, State current, int codePoint) {
    TreeMap<CharInterval, State> outgoing = dfa.defTransitions.get(current);
    if (outgoing != null) {
        CharInterval probe = new CharInterval(codePoint, codePoint);
        Map.Entry<CharInterval, State> candidate = outgoing.floorEntry(probe);
        boolean covered = candidate != null && candidate.getKey().to >= codePoint;
        if (covered) {
            return candidate.getValue();
        }
    }
    return null;
}

/**
* Each DFA is also trivially a NFA, return it.
*/
Expand Down
6 changes: 6 additions & 0 deletions src/main/java/dregex/impl/RegexImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import dregex.IncompatibleRegexException;
import dregex.MatchResult;
import java.io.IOException;
import java.io.InputStream;
import java.text.Normalizer;
import java.time.Duration;
import org.slf4j.Logger;
Expand Down Expand Up @@ -48,6 +50,10 @@ public MatchResult matchAndReport(CharSequence string) {
return DfaAlgorithms.matchString(dfa, builder.toString());
}

/**
 * Matches the contents of a stream against this regex's DFA by delegating to
 * {@link DfaAlgorithms#matchInputStream}.
 * <p>
 * NOTE(review): the {@code CharSequence} overload matches a rebuilt string ({@code builder.toString()}), whereas
 * the stream is handed to the DFA as-is here. Confirm whether any preprocessing applied to strings also needs to
 * be applied to stream input.
 *
 * @param inputStream the stream to match
 * @return an object with information about the matching attempt
 * @throws IOException if reading from the stream fails
 */
public MatchResult matchAndReport(InputStream inputStream) throws IOException {
    MatchResult report = DfaAlgorithms.matchInputStream(dfa, inputStream);
    return report;
}

public RegexImpl intersect(RegexImpl other) {
checkUniverse(other);
var start = System.nanoTime();
Expand Down
Loading