Hello, Wouter!
I remember my patch related to the calc_hash() function, but right now I am concerned with regexps in the best and fastest resolver in the world ;) So, what I need:
- I would like to filter incoming DNS queries (answering with NXDOMAIN) using a set of regular expressions like these (the example set has 6 rules):
"^[a-z,0-9,-](.)?[a-z,0-9,-](xxx2)[a-z,0-9,-].(ripn)[a-z,0-9,-.](.)?$"
"^[a-z,0-9,-](.)?[a-z,0-9,-](xxx3)[a-z,0-9,-].(ripn)[a-z,0-9,-.](.)?$"
"^[a-z,0-9,-](.)?[a-z,0-9,-](xxx4)[a-z,0-9,-].(ripn)[a-z,0-9,-.](.)?$"
"^[a-z,0-9,-](.)?[a-z,0-9,-](xxx5)[a-z,0-9,-].(ripn)[a-z,0-9,-.](.)?$"
"^[a-z,0-9,-](.)?[a-z,0-9,-](xxx6)[a-z,0-9,-].(ripn)[a-z,0-9,-.](.)?$"
"^[a-z,0-9,-](.)?[a-z,0-9,-](xxx7)[a-z,0-9,-].(ripn)[a-z,0-9,-.](.)?$"
- I want the performance of Unbound not to be affected by filtering incoming queries with these rules
- All the rules need to be loaded/reloaded from unbound.conf
This feature could be implemented in Python (using the python module in Unbound), but the performance in that case is too poor (15000 replies per second). The bottleneck is the invalidateQueryInCache() call made from the Python script.
Here is what I have done so far:
- I wrote a simple C file (with its own header file) that calls libpcre-8.43 (PCRE1). I won't give a detailed description of the functions; I think you will see everything for yourself:
- the header fastregexp/fastregexp.h:
#include <pcre.h>
/**
 * One node of a singly linked list of PCRE (PCRE1) patterns.
 * Nodes are allocated by compile_fast_regexp() and filled in further by
 * study_fast_regexp().
 */
struct my_regex {
/** compiled pattern, stored by compile_fast_regexp() */
pcre *my_reCompiled;
/** result of pcre_study(), stored by study_fast_regexp(); can be NULL */
pcre_extra *my_pcreExtra;
/** JIT stack allocated in study_fast_regexp(), passed to pcre_jit_exec() */
pcre_jit_stack *my_jit_stack;
/** next pattern in the list, NULL for the last node */
struct my_regex *next;
};
/**
 * Free the nodes of a pattern list.
 * @param my_regex: first node of the list.
 */
void cleanup_fast_regexp(struct my_regex *my_regex);
/**
 * Match a string against every pattern of the list, in list order.
 * @param my_regex: first node of the list.
 * @param testString: zero-terminated subject string.
 * @return 1 as soon as one pattern matches, 0 if none matches.
 */
int do_fast_regexp(struct my_regex *my_regex, char *testString);
/**
 * Run pcre_study() with JIT compilation on every node of the list and
 * attach a JIT stack to each node.
 * @param my_regex: first node of a list built by compile_fast_regexp().
 * @return the first node of the list; NULL after a study error, in which
 *	case cleanup_fast_regexp() has already been called on the list.
 */
struct my_regex *study_fast_regexp(struct my_regex *my_regex);
/**
 * Compile an array of pattern strings into a newly allocated list, one node
 * per string, in array order.
 * @param my_regex: only used as a local variable; the value passed in is
 *	overwritten without being read.
 * @param aRegexStrV: array of pattern strings.
 * @param num_aRegexStrV: number of entries in aRegexStrV.
 * @return the first node of the new list; NULL if num_aRegexStrV is 0, if
 *	memory allocation fails or if a pattern fails to compile.
 */
struct my_regex *compile_fast_regexp(struct my_regex *my_regex, char *aRegexStrV[], int num_aRegexStrV);
- C-file fastregexp/fastregexp.c:
#include "config.h"
#include "util/log.h"
#include "fastregexp/fastregexp.h"
#include <pcre.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void cleanup_fast_regexp(struct my_regex *my_regex)
{
struct my_regex *my_regex_next;
log_err("Cleaning up all regex structures");
while(my_regex_next != NULL) {
my_regex_next = my_regex->next;
free(my_regex);
my_regex = my_regex_next;
}
}
int do_fast_regexp(struct my_regex *my_regex, char *testString)
{
int subStrVec[30];
while(my_regex != NULL) {
int pcreExecRet = pcre_jit_exec(my_regex->my_reCompiled,
my_regex->my_pcreExtra,
testString,
strlen(testString),
0,
0,
subStrVec,
30,
my_regex->my_jit_stack);
if(pcreExecRet >= 0)
return 1;
my_regex = my_regex->next;
} /* end of while */
return 0;
}
struct my_regex *study_fast_regexp(struct my_regex *my_regex)
{
pcre_extra *pcreExtra;
const char *pcreErrorStr;
struct my_regex *my_regex_start = my_regex;
pcre_jit_stack *jit_stack;
while(my_regex != NULL) {
pcreExtra = pcre_study(my_regex->my_reCompiled, PCRE_STUDY_JIT_COMPILE, &pcreErrorStr);
/* pcre_study() returns NULL for both errors and when it can not optimize the regex. The last argument is how one checks for
errors (it is NULL if everything works, and points to an error string otherwise. */
if(pcreErrorStr != NULL) {
log_err("fastregexp: JIT optimization error: %s. Cleaning up all regex structures", pcreErrorStr);
cleanup_fast_regexp(my_regex_start);
return NULL;
}
jit_stack = pcre_jit_stack_alloc(32*1024, 1024*1024);
pcre_assign_jit_stack(pcreExtra, NULL, jit_stack);
my_regex->my_pcreExtra = pcreExtra;
my_regex->my_jit_stack = jit_stack;
my_regex = my_regex->next;
} /* end of while */
return my_regex_start;
}
struct my_regex *compile_fast_regexp(struct my_regex *my_regex, char *aRegexStrV[], int num_aRegexStrV)
{
pcre *reCompiled;
const char *pcreErrorStr;
int pcreErrorOffset;
char **aStrRegex;
struct my_regex *my_regex_prev = NULL;
struct my_regex *my_regex_start = NULL;
for(int i=0; i<num_aRegexStrV; i++) {
//log_err("the regex is: %s", aRegexStrV[i]);
if((my_regex = (struct my_regex*) malloc(sizeof(struct my_regex))) == NULL) {
log_err("fastregexp: general memory allocation error");
return NULL;
}
if(my_regex_prev != NULL) {
my_regex_prev->next = my_regex;
} else {
my_regex_start = my_regex;
}
reCompiled = pcre_compile(aRegexStrV[i], 0, &pcreErrorStr, &pcreErrorOffset, NULL);
if(reCompiled == NULL) {
log_err("fastregexp: error allocating memory for PCRE stack: regex is %s: the reason: %s. Cleaning up all regex structures", aRegexStrV[i], pcreErrorStr);
cleanup_fast_regexp(my_regex_start);
return NULL;
}
my_regex->my_reCompiled = reCompiled;
my_regex->next = NULL;
my_regex_prev = my_regex;
} /* end of for */
return my_regex_start;
//pcre_free_substring(psubStrMatchStr);
pcre_free(reCompiled);
// Free up the EXTRA PCRE value (may be NULL at this point)
// if(pcreExtra != NULL) {
//#ifdef PCRE_CONFIG_JIT
// pcre_free_study(pcreExtra);
//#else
// pcre_free(pcreExtra);
//#endif
// }
}
Next, I patched the following Unbound source files:
--- unbound-1.9.2.orig/util/module.h 2019-06-17 11:50:16.000000000 +0300
+++ unbound-1.9.2/util/module.h 2019-09-16 11:54:20.302813000 +0300
@@ -156,6 +156,8 @@
#include "util/storage/lruhash.h"
#include "util/data/msgreply.h"
#include "util/data/msgparse.h"
+//igorr
+#include "fastregexp/fastregexp.h"
struct sldns_buffer;
struct alloc_cache;
struct rrset_cache;
@@ -512,6 +514,10 @@
/* Make every mesh state unique, do not aggregate mesh states. */
int unique_mesh;
-
-
/* pointer to the my_regex structure to perform fast PCRE regexp's */
-
struct my_regex *my_fast_regexp;
};
/**
--- unbound-1.9.2.orig/daemon/worker.c 2019-06-17 11:50:16.000000000 +0300
+++ unbound-1.9.2/daemon/worker.c 2019-09-17 13:00:20.176700000 +0300
@@ -1892,6 +1892,11 @@
worker->env.cfg->stat_interval);
worker_restart_timer(worker);
}
+
-
-
if((worker->env.my_fast_regexp = compile_fast_regexp(worker->env.my_fast_regexp, worker->env.cfg->regexstrv, worker->env.cfg->num_regexstrv)) != NULL)
-
worker->env.my_fast_regexp = study_fast_regexp(worker->env.my_fast_regexp);
-
}
@@ -1933,6 +1938,8 @@
alloc_clear(&worker->alloc);
regional_destroy(worker->env.scratch);
regional_destroy(worker->scratchpad);
-
-
cleanup_fast_regexp(worker->env.my_fast_regexp);
free(worker);
}
--- unbound-1.9.2.orig/iterator/iterator.c 2019-06-17 11:50:16.000000000 +0300
+++ unbound-1.9.2/iterator/iterator.c 2019-09-16 12:34:32.062665000 +0300
@@ -160,6 +160,7 @@
outbound_list_init(&iq->outlist);
iq->minimise_count = 0;
iq->minimise_timeout_count = 0;
+
if (qstate->env->cfg->qname_minimisation)
iq->minimisation_state = INIT_MINIMISE_STATE;
else
@@ -2576,6 +2577,23 @@
enum response_type type;
iq->num_current_queries--;
-
-
if(qstate->env->my_fast_regexp != NULL) {
-
-
char *my_qname_p = my_qname;
-
-
char *qdn_buf_p = qdn_buf;
-
size_t qdn_buf_len = sizeof(qdn_buf);
-
strcpy(my_qname_p, qstate->qinfo.qname);
-
size_t my_qname_len = qstate->qinfo.qname_len;
-
sldns_wire2str_dname_scan(&my_qname_p, &my_qname_len, &qdn_buf_p, &qdn_buf_len, NULL, 0);
-
if(do_fast_regexp(qstate->env->my_fast_regexp, qdn_buf) == 1) {
-
log_warn("blacklisting the domain name: %s", qdn_buf);
-
return error_response_cache(qstate, id, LDNS_RCODE_NXDOMAIN);
-
-
-
if(!inplace_cb_query_response_call(qstate->env, qstate, iq->response))
log_err("unable to call query_response callback");
--- unbound-1.9.2.orig/util/config_file.h 2019-06-17 11:50:16.000000000 +0300
+++ unbound-1.9.2/util/config_file.h 2019-09-16 13:07:10.312655000 +0300
@@ -575,6 +575,10 @@
int redis_timeout;
#endif
#endif
-
-
/* fastregexp regexp descriptions */
-
-
};
/** from cfg username, after daemonize setup performed */
--- unbound-1.9.2.orig/util/config_file.c 2019-06-17 11:50:16.000000000 +0300
+++ unbound-1.9.2/util/config_file.c 2019-09-16 17:28:00.678244000 +0300
@@ -327,6 +327,9 @@
cfg->cachedb_backend = NULL;
cfg->cachedb_secret = NULL;
#endif
-
-
-
cfg->regexstrv = NULL;
return cfg;
error_exit:
config_delete(cfg);
@@ -1092,6 +1095,8 @@
else O_STR(opt, "backend", cachedb_backend)
else O_STR(opt, "secret-seed", cachedb_secret)
#endif
-
-
else O_IFC(opt, "pattern", num_regexstrv, regexstrv)
/* not here:
* outgoing-permit, outgoing-avoid - have list of ports
* local-zone - zones and nodefault variables
@@ -1428,6 +1433,8 @@
free(cfg->cachedb_backend);
free(cfg->cachedb_secret);
#endif
-
-
config_del_strarray(cfg->regexstrv, cfg->num_regexstrv);
free(cfg);
}
--- unbound-1.9.2.orig/util/configparser.y 2019-06-17 11:50:16.000000000 +0300
+++ unbound-1.9.2/util/configparser.y 2019-09-16 17:27:35.678485000 +0300
@@ -158,6 +158,7 @@
%token VAR_IPSECMOD_MAX_TTL VAR_IPSECMOD_WHITELIST VAR_IPSECMOD_STRICT
%token VAR_CACHEDB VAR_CACHEDB_BACKEND VAR_CACHEDB_SECRETSEED
%token VAR_CACHEDB_REDISHOST VAR_CACHEDB_REDISPORT VAR_CACHEDB_REDISTIMEOUT
+%token VAR_REGEXP VAR_REGEXP_PATTERN
%token VAR_UDP_UPSTREAM_WITHOUT_DOWNSTREAM VAR_FOR_UPSTREAM
%token VAR_AUTH_ZONE VAR_ZONEFILE VAR_MASTER VAR_URL VAR_FOR_DOWNSTREAM
%token VAR_FALLBACK_ENABLED VAR_TLS_ADDITIONAL_PORT VAR_LOW_RTT VAR_LOW_RTT_PERMIL
@@ -174,7 +175,7 @@
forwardstart contents_forward | pythonstart contents_py |
rcstart contents_rc | dtstart contents_dt | viewstart contents_view |
dnscstart contents_dnsc | cachedbstart contents_cachedb |
-
authstart contents_auth | regexpstart contents_regexp
;
/* server: declaration */
@@ -2959,6 +2960,28 @@
}
}
;
+regexpstart: VAR_REGEXP
-
-
OUTYY(("\nP(regexp:)\n"));
-
-
+contents_regexp: contents_regexp content_regexp
+content_regexp: regexp_pattern
+regexp_pattern: VAR_REGEXP_PATTERN STRING_ARG
-
-
OUTYY(("P(regexp_pattern:%s)\n", $2));
-
if(cfg_parser->cfg->num_regexstrv == 0)
-
cfg_parser->cfg->regexstrv = calloc(1, sizeof(char*));
-
else cfg_parser->cfg->regexstrv = realloc(cfg_parser->cfg->regexstrv,
-
(cfg_parser->cfg->num_regexstrv+1)*sizeof(char*));
-
if(!cfg_parser->cfg->regexstrv)
-
yyerror("out of memory");
-
-
cfg_parser->cfg->regexstrv[cfg_parser->cfg->num_regexstrv++] = $2;
-
-
%%
/* parse helper routines could be here */
--- unbound-1.9.2.orig/util/configlexer.lex 2019-06-17 11:50:16.000000000 +0300
+++ unbound-1.9.2/util/configlexer.lex 2019-09-16 15:04:30.764354000 +0300
@@ -483,6 +483,8 @@
redis-server-host{COLON} { YDVAR(1, VAR_CACHEDB_REDISHOST) }
redis-server-port{COLON} { YDVAR(1, VAR_CACHEDB_REDISPORT) }
redis-timeout{COLON} { YDVAR(1, VAR_CACHEDB_REDISTIMEOUT) }
+regexp{COLON} { YDVAR(0, VAR_REGEXP) }
+pattern{COLON} { YDVAR(1, VAR_REGEXP_PATTERN) }
udp-upstream-without-downstream{COLON} { YDVAR(1, VAR_UDP_UPSTREAM_WITHOUT_DOWNSTREAM) }
tcp-connection-limit{COLON} { YDVAR(2, VAR_TCP_CONNECTION_LIMIT) }
<INITIAL,val>{NEWLINE} { LEXOUT(("NL\n")); cfg_parser->line++; }
--- unbound-1.9.2.orig/Makefile 2019-09-17 13:38:35.414726000 +0300
+++ unbound-1.9.2/Makefile 2019-09-16 12:31:51.334154000 +0300
@@ -59,14 +59,14 @@
PYTHON_CPPFLAGS=-I. -I/usr/local/include/python2.7
CFLAGS=-DSRCDIR=$(srcdir) -g -O2 -D_THREAD_SAFE -pthread
LDFLAGS=-L/usr/local/lib -L/usr/local/lib -L/usr/local/lib
-LIBS=-lutil -levent -L/usr/local/lib -L/usr/local/lib/python2.7 -L. -lpython2.7 -lcrypto -lhiredis
+LIBS=-lutil -levent -L/usr/local/lib -L/usr/local/lib/python2.7 -L. -lpython2.7 -lcrypto -lhiredis -lpcre
LIBOBJS= ${LIBOBJDIR}explicit_bzero$U.o ${LIBOBJDIR}reallocarray$U.o
# filter out ctime_r from compat obj.
LIBOBJ_WITHOUT_CTIME= explicit_bzero.o reallocarray.o
LIBOBJ_WITHOUT_CTIMEARC4= explicit_bzero.o
RUNTIME_PATH= -R/usr/local/lib
DEPFLAG=-MM
-DATE=20190917
+DATE=20190912
LIBTOOL=$(libtool)
BUILD=build/
UBSYMS=-export-symbols $(srcdir)/libunbound/ubsyms.def
@@ -126,7 +126,8 @@
edns-subnet/edns-subnet.c edns-subnet/subnetmod.c
edns-subnet/addrtree.c edns-subnet/subnet-whitelist.c
cachedb/cachedb.c cachedb/redis.c respip/respip.c $(CHECKLOCK_SRC)
-$(DNSTAP_SRC) $(DNSCRYPT_SRC) $(IPSECMOD_SRC)
+$(DNSTAP_SRC) $(DNSCRYPT_SRC) $(IPSECMOD_SRC)
+fastregexp/fastregexp.c
COMMON_OBJ_WITHOUT_NETCALL=dns.lo infra.lo rrset.lo dname.lo msgencode.lo
as112.lo msgparse.lo msgreply.lo packed_rrset.lo iterator.lo iter_delegpt.lo
iter_donotq.lo iter_fwd.lo iter_hints.lo iter_priv.lo iter_resptype.lo
@@ -139,7 +140,7 @@
validator.lo val_kcache.lo val_kentry.lo val_neg.lo val_nsec3.lo val_nsec.lo
val_secalgo.lo val_sigcrypt.lo val_utils.lo dns64.lo cachedb.lo redis.lo authzone.lo
$(SUBNET_OBJ) $(PYTHONMOD_OBJ) $(CHECKLOCK_OBJ) $(DNSTAP_OBJ) $(DNSCRYPT_OBJ)
-$(IPSECMOD_OBJ) respip.lo
+$(IPSECMOD_OBJ) respip.lo fastregexp.lo
COMMON_OBJ_WITHOUT_UB_EVENT=$(COMMON_OBJ_WITHOUT_NETCALL) netevent.lo listen_dnsport.lo
outside_network.lo
COMMON_OBJ=$(COMMON_OBJ_WITHOUT_UB_EVENT) ub_event.lo
@@ -692,7 +693,7 @@
$(srcdir)/services/modstack.h $(srcdir)/util/net_help.h $(srcdir)/util/regional.h $(srcdir)/util/data/dname.h
$(srcdir)/util/data/msgencode.h $(srcdir)/util/fptr_wlist.h $(srcdir)/util/tube.h $(srcdir)/util/config_file.h
$(srcdir)/util/random.h $(srcdir)/sldns/wire2str.h $(srcdir)/sldns/str2wire.h $(srcdir)/sldns/parseutil.h \
- $(srcdir)/sldns/sbuffer.h
-
$(srcdir)/sldns/sbuffer.h $(srcdir)/fastregexp/fastregexp.h
iter_delegpt.lo iter_delegpt.o: $(srcdir)/iterator/iter_delegpt.c config.h $(srcdir)/iterator/iter_delegpt.h
$(srcdir)/util/log.h $(srcdir)/services/cache/dns.h $(srcdir)/util/storage/lruhash.h $(srcdir)/util/locks.h
$(srcdir)/util/data/msgreply.h $(srcdir)/util/data/packed_rrset.h $(srcdir)/util/regional.h
@@ -1214,7 +1215,7 @@
$(srcdir)/util/fptr_wlist.h $(srcdir)/util/tube.h $(srcdir)/util/edns.h $(srcdir)/iterator/iter_fwd.h
$(srcdir)/iterator/iter_hints.h $(srcdir)/validator/autotrust.h $(srcdir)/validator/val_anchor.h
$(srcdir)/respip/respip.h $(srcdir)/libunbound/context.h $(srcdir)/libunbound/unbound-event.h \
-
$(srcdir)/libunbound/libworker.h $(srcdir)/sldns/wire2str.h $(srcdir)/util/shm_side/shm_main.h
-
$(srcdir)/libunbound/libworker.h $(srcdir)/sldns/wire2str.h $(srcdir)/util/shm_side/shm_main.h $(srcdir)/fastregexp/fastregexp.h
testbound.lo testbound.o: $(srcdir)/testcode/testbound.c config.h $(srcdir)/testcode/testpkts.h
$(srcdir)/testcode/replay.h $(srcdir)/util/netevent.h $(srcdir)/dnscrypt/dnscrypt.h
$(srcdir)/util/rbtree.h $(srcdir)/testcode/fake_event.h
@@ -1247,7 +1248,7 @@
$(srcdir)/util/fptr_wlist.h $(srcdir)/util/tube.h $(srcdir)/util/edns.h $(srcdir)/iterator/iter_fwd.h
$(srcdir)/iterator/iter_hints.h $(srcdir)/validator/autotrust.h $(srcdir)/validator/val_anchor.h
$(srcdir)/respip/respip.h $(srcdir)/libunbound/context.h $(srcdir)/libunbound/unbound-event.h \
-
$(srcdir)/libunbound/libworker.h $(srcdir)/sldns/wire2str.h $(srcdir)/util/shm_side/shm_main.h
-
$(srcdir)/libunbound/libworker.h $(srcdir)/sldns/wire2str.h $(srcdir)/util/shm_side/shm_main.h $(srcdir)/fastregexp/fastregexp.h
acl_list.lo acl_list.o: $(srcdir)/daemon/acl_list.c config.h $(srcdir)/daemon/acl_list.h
$(srcdir)/util/storage/dnstree.h $(srcdir)/util/rbtree.h $(srcdir)/services/view.h $(srcdir)/util/locks.h
$(srcdir)/util/log.h $(srcdir)/util/regional.h $(srcdir)/util/config_file.h $(srcdir)/util/net_help.h
@@ -1462,3 +1463,4 @@
reallocarray.lo reallocarray.o: $(srcdir)/compat/reallocarray.c config.h
isblank.lo isblank.o: $(srcdir)/compat/isblank.c config.h
strsep.lo strsep.o: $(srcdir)/compat/strsep.c config.h
+fastregexp.lo fastregexp.o: $(srcdir)/fastregexp/fastregexp.c config.h $(srcdir)/fastregexp/fastregexp.h $(srcdir)/util/log.h
That's all, if I didn't forget anything. About the Makefile: I know that the right way is to patch Makefile.in, but for now I am interested in the final result in terms of stability and performance. As for the yacc/lex sources: I tried to add my two options (regexp: and pattern:) by following the existing declarations of config options, and it was quite hard for me ;)
What I have now:
- Performance: 110000-120000 replies per second (my sandbox is a KVM virtual machine)
- I can add/remove regexps in unbound.conf. The structure of this config section is the following:
regexp:
pattern: "^[a-z,0-9,-](.)?[a-z,0-9,-](xxx2)[a-z,0-9,-].(ripn)[a-z,0-9,-.](.)?$"
pattern: "^[a-z,0-9,-](.)?[a-z,0-9,-](xxx3)[a-z,0-9,-].(ripn)[a-z,0-9,-.](.)?$"
pattern: "^[a-z,0-9,-](.)?[a-z,0-9,-](xxx4)[a-z,0-9,-].(ripn)[a-z,0-9,-.](.)?$"
pattern: "^[a-z,0-9,-](.)?[a-z,0-9,-](xxx5)[a-z,0-9,-].(ripn)[a-z,0-9,-.](.)?$"
pattern: "^[a-z,0-9,-](.)?[a-z,0-9,-](xxx6)[a-z,0-9,-].(ripn)[a-z,0-9,-.](.)?$"
pattern: "^[a-z,0-9,-](.)?[a-z,0-9,-](xxx7)[a-z,0-9,-].(ripn)[a-z,0-9,-.](.)?$"
But I have several issues:
- a memory leak after "unbound-control reload" when I add/remove some regexp rules
- if I use the redis cachedb and add regexp rules to filter queries that were previously answered, such queries are still resolved from redis
What I would like now is your authoritative opinion on whether my approach is right, or whether (and this is most likely) I have made mistakes in my code. Could you please review my patches and tell me what else I have to do?
Thank you very much in advance!