Skip to content

Commit

Permalink
HIVE-26047: Vectorized LIKE UDF optimization (Ryu Kobayashi, reviewed…
Browse files Browse the repository at this point in the history
… by Denys Kuzmenko)

Closes #4998
  • Loading branch information
ryukobayashi authored Apr 19, 2024
1 parent 6b48995 commit 2134e3d
Show file tree
Hide file tree
Showing 3 changed files with 153 additions and 93 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ protected interface CheckerFactory {
protected static final class NoneChecker implements Checker {
final byte [] byteSub;

NoneChecker(String pattern) {
public NoneChecker(String pattern) {
byteSub = pattern.getBytes(StandardCharsets.UTF_8);
}

Expand All @@ -250,7 +250,7 @@ public boolean check(byte[] byteS, int start, int len) {
protected static final class BeginChecker implements Checker {
final byte[] byteSub;

BeginChecker(String pattern) {
public BeginChecker(String pattern) {
byteSub = pattern.getBytes(StandardCharsets.UTF_8);
}

Expand All @@ -269,7 +269,7 @@ public boolean check(byte[] byteS, int start, int len) {
protected static final class EndChecker implements Checker {
final byte[] byteSub;

EndChecker(String pattern) {
public EndChecker(String pattern) {
byteSub = pattern.getBytes(StandardCharsets.UTF_8);
}

Expand All @@ -288,7 +288,7 @@ public boolean check(byte[] byteS, int start, int len) {
protected static final class MiddleChecker implements Checker {
final StringExpr.Finder finder;

MiddleChecker(String pattern) {
public MiddleChecker(String pattern) {
finder = StringExpr.compile(pattern.getBytes(StandardCharsets.UTF_8));
}

Expand Down Expand Up @@ -324,7 +324,7 @@ protected static final class ChainedChecker implements Checker {
final int beginLen;
final int endLen;

ChainedChecker(String pattern) {
public ChainedChecker(String pattern) {
final StringTokenizer tokens = new StringTokenizer(pattern, "%");
final boolean leftAnchor = pattern.startsWith("%") == false;
final boolean rightAnchor = pattern.endsWith("%") == false;
Expand Down Expand Up @@ -413,12 +413,12 @@ private int utf8Length(String chunk) {
/**
* Matches each string to a pattern with Java regular expression package.
*/
protected static class ComplexChecker implements Checker {
protected static final class ComplexChecker implements Checker {
Pattern compiledPattern;
Matcher matcher;
FastUTF8Decoder decoder;

ComplexChecker(String pattern) {
public ComplexChecker(String pattern) {
compiledPattern = Pattern.compile(pattern);
matcher = compiledPattern.matcher("");
decoder = new FastUTF8Decoder();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/*
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
Expand All @@ -20,25 +20,27 @@

import org.apache.hadoop.hive.ql.udf.UDFLike;

import com.google.common.collect.ImmutableList;

import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Evaluate LIKE filter on a batch for a vector of strings.
*/
public class FilterStringColLikeStringScalar extends AbstractFilterStringColLikeStringScalar {
private static final long serialVersionUID = 1L;

private transient final static List<CheckerFactory> checkerFactories = Arrays.asList(
new BeginCheckerFactory(),
new EndCheckerFactory(),
new MiddleCheckerFactory(),
new NoneCheckerFactory(),
new ChainedCheckerFactory(),
new ComplexCheckerFactory());
// Single factory: classifies the LIKE pattern with UDFLikePattern.matcher and then
// reflectively instantiates the selected Checker class through its public
// (String) constructor, passing the pattern as rewritten by UDFLikePattern.format.
private static final List<CheckerFactory> CHECKER_FACTORIES = ImmutableList.of(
    pattern -> {
      UDFLikePattern udfLike = UDFLikePattern.matcher(pattern);
      try {
        return udfLike.checker.getConstructor(String.class).newInstance(
            udfLike.format(pattern));
      } catch (Exception e) {
        // Chain the original failure (reflection error, or an exception thrown by
        // format / the Checker constructor) so the root cause is not lost.
        throw new IllegalArgumentException("unable to initialize Checker", e);
      }
    });

public FilterStringColLikeStringScalar() {
super();
Expand All @@ -51,93 +53,83 @@ public FilterStringColLikeStringScalar(int colNum, byte[] likePattern) {

@Override
protected List<CheckerFactory> getCheckerFactories() {
return checkerFactories;
return CHECKER_FACTORIES;
}

/**
* Accepts simple LIKE patterns like "abc%" and creates corresponding checkers.
*/
private static class BeginCheckerFactory implements CheckerFactory {
private static final Pattern BEGIN_PATTERN = Pattern.compile("([^_%]+)%");

public Checker tryCreate(String pattern) {
Matcher matcher = BEGIN_PATTERN.matcher(pattern);
if (matcher.matches()) {
return new BeginChecker(matcher.group(1));
private enum UDFLikePattern {
// Accepts simple LIKE patterns like "abc%" and creates corresponding checkers.
BEGIN(BeginChecker.class) {
@Override
String format(String pattern) {
return pattern.substring(0, pattern.length() - 1);
}
return null;
}
}

/**
* Accepts simple LIKE patterns like "%abc" and creates a corresponding checkers.
*/
private static class EndCheckerFactory implements CheckerFactory {
private static final Pattern END_PATTERN = Pattern.compile("%([^_%]+)");

public Checker tryCreate(String pattern) {
Matcher matcher = END_PATTERN.matcher(pattern);
if (matcher.matches()) {
return new EndChecker(matcher.group(1));
},
// Accepts simple LIKE patterns like "%abc" and creates a corresponding checker.
END(EndChecker.class) {
@Override
String format(String pattern) {
return pattern.substring(1);
}
return null;
}
}

/**
* Accepts simple LIKE patterns like "%abc%" and creates a corresponding checkers.
*/
private static class MiddleCheckerFactory implements CheckerFactory {
private static final Pattern MIDDLE_PATTERN = Pattern.compile("%([^_%]+)%");

public Checker tryCreate(String pattern) {
Matcher matcher = MIDDLE_PATTERN.matcher(pattern);
if (matcher.matches()) {
return new MiddleChecker(matcher.group(1));
},
// Accepts simple LIKE patterns like "%abc%" and creates a corresponding checker.
MIDDLE(MiddleChecker.class) {
@Override
String format(String pattern) {
return pattern.substring(1, pattern.length() - 1);
}
return null;
}
}
},
// Accepts any LIKE patterns and creates corresponding checkers.
COMPLEX(ComplexChecker.class) {
@Override
String format(String pattern) {
return "^" + UDFLike.likePatternToRegExp(pattern) + "$";
}
},
// Accepts chained LIKE patterns without escaping like "abc%def%ghi%" and
// creates corresponding checkers.
CHAINED(ChainedChecker.class),
// Accepts simple LIKE patterns like "abc" and creates corresponding checkers.
NONE(NoneChecker.class);

/**
* Accepts simple LIKE patterns like "abc" and creates corresponding checkers.
*/
private static class NoneCheckerFactory implements CheckerFactory {
private static final Pattern NONE_PATTERN = Pattern.compile("[^%_]+");
Class<? extends Checker> checker;

public Checker tryCreate(String pattern) {
Matcher matcher = NONE_PATTERN.matcher(pattern);
if (matcher.matches()) {
return new NoneChecker(pattern);
}
return null;
UDFLikePattern(Class<? extends Checker> checker) {
this.checker = checker;
}
}

/**
* Accepts chained LIKE patterns without escaping like "abc%def%ghi%" and creates corresponding
* checkers.
*
*/
private static class ChainedCheckerFactory implements CheckerFactory {
private static final Pattern CHAIN_PATTERN = Pattern.compile("(%?[^%_\\\\]+%?)+");

public Checker tryCreate(String pattern) {
Matcher matcher = CHAIN_PATTERN.matcher(pattern);
if (matcher.matches()) {
return new ChainedChecker(pattern);
private static UDFLikePattern matcher(String pattern) {
UDFLikePattern lastType = NONE;
int length = pattern.length();
char lastChar = 0;

for (int i = 0; i < length; i++) {
char n = pattern.charAt(i);
if (n == '_' && lastChar != '\\') { // such as "a_bc"
return COMPLEX;
} else if (n == '%') {
if (i == 0) { // such as "%abc"
lastType = END;
} else if (i < length - 1) {
if (lastChar != '\\') { // such as "a%bc"
lastType = CHAINED;
}
} else {
if (lastChar != '\\') {
if (lastType == END) { // such as "%abc%"
lastType = MIDDLE;
} else if (lastType != CHAINED) {
lastType = BEGIN; // such as "abc%"
}
}
}
}
lastChar = n;
}
return null;
return lastType;
}
}

/**
* Accepts any LIKE patterns and creates corresponding checkers.
*/
private static class ComplexCheckerFactory implements CheckerFactory {
public Checker tryCreate(String pattern) {
// anchor the pattern to the start:end of the whole string.
return new ComplexChecker("^" + UDFLike.likePatternToRegExp(pattern) + "$");
String format(String pattern) {
return pattern;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4303,20 +4303,38 @@ public void testStringLikePatternType() throws HiveException {
Assert.assertEquals(FilterStringColLikeStringScalar.BeginChecker.class,
expr.checker.getClass());

expr = new FilterStringColLikeStringScalar(0, "abc\\%def%".getBytes());
expr.transientInit(hiveConf);
expr.evaluate(vrb);
Assert.assertEquals(FilterStringColLikeStringScalar.BeginChecker.class,
expr.checker.getClass());

// END pattern
expr = new FilterStringColLikeStringScalar(0, "%abc".getBytes(StandardCharsets.UTF_8));
expr.transientInit(hiveConf);
expr.evaluate(vrb);
Assert.assertEquals(FilterStringColLikeStringScalar.EndChecker.class,
expr.checker.getClass());

expr = new FilterStringColLikeStringScalar(0, "%abc\\%def".getBytes(StandardCharsets.UTF_8));
expr.transientInit(hiveConf);
expr.evaluate(vrb);
Assert.assertEquals(FilterStringColLikeStringScalar.EndChecker.class,
expr.checker.getClass());

// MIDDLE pattern
expr = new FilterStringColLikeStringScalar(0, "%abc%".getBytes());
expr.transientInit(hiveConf);
expr.evaluate(vrb);
Assert.assertEquals(FilterStringColLikeStringScalar.MiddleChecker.class,
expr.checker.getClass());

expr = new FilterStringColLikeStringScalar(0, "%abc\\%def%".getBytes());
expr.transientInit(hiveConf);
expr.evaluate(vrb);
Assert.assertEquals(FilterStringColLikeStringScalar.MiddleChecker.class,
expr.checker.getClass());

// CHAIN pattern
expr = new FilterStringColLikeStringScalar(0, "%abc%de".getBytes());
expr.transientInit(hiveConf);
Expand All @@ -4331,6 +4349,56 @@ public void testStringLikePatternType() throws HiveException {
Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
expr.checker.getClass());

expr = new FilterStringColLikeStringScalar(0, "abc_".getBytes());
expr.transientInit(hiveConf);
expr.evaluate(vrb);
Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
expr.checker.getClass());

expr = new FilterStringColLikeStringScalar(0, "abc\\_def_".getBytes());
expr.transientInit(hiveConf);
expr.evaluate(vrb);
Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
expr.checker.getClass());

expr = new FilterStringColLikeStringScalar(0, "_abc".getBytes(StandardCharsets.UTF_8));
expr.transientInit(hiveConf);
expr.evaluate(vrb);
Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
expr.checker.getClass());

expr = new FilterStringColLikeStringScalar(0, "_abc\\_def".getBytes(StandardCharsets.UTF_8));
expr.transientInit(hiveConf);
expr.evaluate(vrb);
Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
expr.checker.getClass());

expr = new FilterStringColLikeStringScalar(0, "_abc_".getBytes());
expr.transientInit(hiveConf);
expr.evaluate(vrb);
Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
expr.checker.getClass());

expr = new FilterStringColLikeStringScalar(0, "_abc\\_def_".getBytes());
expr.transientInit(hiveConf);
expr.evaluate(vrb);
Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
expr.checker.getClass());

expr = new FilterStringColLikeStringScalar(0, "_abc_de".getBytes());
expr.transientInit(hiveConf);
expr.evaluate(vrb);
Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
expr.checker.getClass());


expr = new FilterStringColLikeStringScalar(0,
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_b".getBytes());
expr.transientInit(hiveConf);
expr.evaluate(vrb);
Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
expr.checker.getClass());

// NONE pattern
expr = new FilterStringColLikeStringScalar(0, "abc".getBytes());
expr.transientInit(hiveConf);
Expand Down

0 comments on commit 2134e3d

Please sign in to comment.