Reduce blacklists RAM usage and increase limit to 1500k rules

sparsehash is now used to reduce memory usage of malware blacklists.
uthash introduces a 56 bytes overhead per-item (UT_hash_handle).
sparsehash, on the other, hand, only has ~16 bytes of overhead (HTItem).
This can also be further reduced. With 400k rules, this currently
provides a 25 MB reduced usage. Lookup performance is also similar
to uthash.
This commit is contained in:
emanuele-f 2025-02-09 16:55:41 +01:00
parent a73b0bc1cf
commit 94d13885bb
5 changed files with 1809 additions and 26 deletions

View File

@ -1,3 +1,4 @@
project(common C)
ADD_LIBRARY(common STATIC uid_lru.c utils.c jni_utils.c uid_resolver.c)
ADD_LIBRARY(common STATIC uid_lru.c utils.c jni_utils.c uid_resolver.c
../third_party/libchash.c)

View File

@ -21,11 +21,7 @@
#include <string.h>
#include "pcapdroid.h"
#include "common/utils.h"
typedef struct {
char *key;
UT_hash_handle hh;
} string_entry_t;
#include "third_party/libchash.h"
typedef struct {
char country_code[3];
@ -38,7 +34,7 @@ typedef struct {
} int_entry_t;
struct blacklist {
string_entry_t *domains;
struct HashTable *domains;
int_entry_t *uids;
ndpi_ptree_t *ptree;
country_entry_t* countries;
@ -58,6 +54,13 @@ blacklist_t* blacklist_init() {
return NULL;
}
bl->domains = AllocateHashTable(0 /* keys are null terminated */, 1 /* copy keys */);
if (!bl->domains) {
ndpi_ptree_destroy(bl->ptree);
bl_free(bl);
return NULL;
}
return bl;
}
@ -70,17 +73,10 @@ int blacklist_add_domain(blacklist_t *bl, const char *domain) {
if(blacklist_match_domain(bl, domain))
return -EADDRINUSE; // duplicate domain
string_entry_t *entry = bl_malloc(sizeof(string_entry_t));
if(!entry)
HTItem* entry = HashInsert(bl->domains, PTR_KEY(bl->domains, domain), 0);
if (!entry)
return -ENOMEM;
entry->key = bl_strdup(domain);
if(!entry->key) {
bl_free(entry);
return -ENOMEM;
}
HASH_ADD_KEYPTR(hh, bl->domains, entry->key, strlen(entry->key), entry);
bl->stats.num_domains++;
return 0;
}
@ -172,7 +168,7 @@ int blacklist_load_file(blacklist_t *bl, const char *path, blacklist_type btype,
FILE *f;
char buffer[256];
int num_ok = 0, num_fail = 0, num_dup = 0;
int max_file_rules = 500000;
int max_file_rules = 15000000;
f = fopen(path, "r");
if(!f) {
@ -270,12 +266,7 @@ int blacklist_load_file(blacklist_t *bl, const char *path, blacklist_type btype,
/* ******************************************************* */
void blacklist_destroy(blacklist_t *bl) {
string_entry_t *entry, *tmp;
HASH_ITER(hh, bl->domains, entry, tmp) {
HASH_DELETE(hh, bl->domains, entry);
bl_free(entry->key);
bl_free(entry);
}
FreeHashTable(bl->domains);
int_entry_t *entry_i, *tmp_i;
HASH_ITER(hh, bl->uids, entry_i, tmp_i) {
@ -350,20 +341,21 @@ static char* get_second_level_domain(const char *domain) {
bool blacklist_match_domain(blacklist_t *bl, const char *domain) {
// Keep in sync with MatchList.matchesHost
string_entry_t *entry = NULL;
HashTable* ht = bl->domains;
HTItem *entry = NULL;
if(strncmp(domain, "www.", 4) == 0)
domain += 4;
// exact domain match
HASH_FIND_STR(bl->domains, domain, entry);
entry = HashFind(ht, PTR_KEY(ht, domain));
if(entry != NULL)
return true;
// 2nd-level domain match
char *domain2 = get_second_level_domain(domain);
if(domain2 != domain) {
HASH_FIND_STR(bl->domains, domain2, entry);
entry = HashFind(ht, PTR_KEY(ht, domain2));
if(entry != NULL)
return true;
}

1537
app/src/main/jni/third_party/libchash.c vendored Normal file

File diff suppressed because it is too large Load Diff

252
app/src/main/jni/third_party/libchash.h vendored Normal file
View File

@ -0,0 +1,252 @@
/* Copyright (c) 1998 - 2005, Google Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* ---
* Author: Craig Silverstein
*
* This library is intended to be used for in-memory hash tables,
* though it provides rudimentary permanent-storage capabilities.
* It attempts to be fast, portable, and small. The best algorithm
* to fulfill these goals is an internal probing hashing algorithm,
* as in Knuth, _Art of Computer Programming_, vol III. Unlike
* chained (open) hashing, it doesn't require a pointer for every
* item, yet it is still constant time lookup in practice.
*
* Also to save space, we let the contents (both data and key) that
* you insert be a union: if the key/data is small, we store it
* directly in the hashtable, otherwise we store a pointer to it.
* To keep you from having to figure out which, use KEY_PTR and
* PTR_KEY to convert between the arguments to these functions and
* a pointer to the real data. For instance:
* char key[] = "ab", *key2;
* HTItem *bck; HashTable *ht;
* HashInsert(ht, PTR_KEY(ht, key), 0);
* bck = HashFind(ht, PTR_KEY(ht, "ab"));
* key2 = KEY_PTR(ht, bck->key);
*
* There are a rich set of operations supported:
* AllocateHashTable() -- Allocates a hashtable structure and
* returns it.
* cchKey: if it's a positive number, then each key is a
* fixed-length record of that length. If it's 0,
* the key is assumed to be a \0-terminated string.
* fSaveKey: normally, you are responsible for allocating
* space for the key. If this is 1, we make a
* copy of the key for you.
* ClearHashTable() -- Removes everything from a hashtable
* FreeHashTable() -- Frees memory used by a hashtable
*
* HashFind() -- takes a key (use PTR_KEY) and returns the
* HTItem containing that key, or NULL if the
* key is not in the hashtable.
* HashFindLast() -- returns the item found by last HashFind()
* HashFindOrInsert() -- inserts the key/data pair if the key
* is not already in the hashtable, or
* returns the appropraite HTItem if it is.
* HashFindOrInsertItem() -- takes key/data as an HTItem.
* HashInsert() -- adds a key/data pair to the hashtable. What
* it does if the key is already in the table
* depends on the value of SAMEKEY_OVERWRITE.
* HashInsertItem() -- takes key/data as an HTItem.
* HashDelete() -- removes a key/data pair from the hashtable,
* if it's there. RETURNS 1 if it was there,
* 0 else.
* If you use sparse tables and never delete, the full data
* space is available. Otherwise we steal -2 (maybe -3),
* so you can't have data fields with those values.
* HashDeleteLast() -- deletes the item returned by the last Find().
*
* HashFirstBucket() -- used to iterate over the buckets in a
* hashtable. DON'T INSERT OR DELETE WHILE
* ITERATING! You can't nest iterations.
* HashNextBucket() -- RETURNS NULL at the end of iterating.
*
* HashSetDeltaGoalSize() -- if you're going to insert 1000 items
* at once, call this fn with arg 1000.
* It grows the table more intelligently.
*
* HashSave() -- saves the hashtable to a file. It saves keys ok,
* but it doesn't know how to interpret the data field,
* so if the data field is a pointer to some complex
* structure, you must send a function that takes a
* file pointer and a pointer to the structure, and
* write whatever you want to write. It should return
* the number of bytes written. If the file is NULL,
* it should just return the number of bytes it would
* write, without writing anything.
* If your data field is just an integer, not a
* pointer, just send NULL for the function.
* HashLoad() -- loads a hashtable. It needs a function that takes
* a file and the size of the structure, and expects
* you to read in the structure and return a pointer
* to it. You must do memory allocation, etc. If
* the data is just a number, send NULL.
* HashLoadKeys() -- unlike HashLoad(), doesn't load the data off disk
* until needed. This saves memory, but if you look
* up the same key a lot, it does a disk access each
* time.
* You can't do Insert() or Delete() on hashtables that were loaded
* from disk.
*/
#include <sys/types.h> /* includes definition of "ulong", we hope */
#define ulong u_long
#define MAGIC_KEY "CHsh" /* when we save the file */
#ifndef LOG_WORD_SIZE /* 5 for 32 bit words, 6 for 64 */
#if defined (__LP64__) || defined (_LP64)
#define LOG_WORD_SIZE 6 /* log_2(sizeof(ulong)) [in bits] */
#else
#define LOG_WORD_SIZE 5 /* log_2(sizeof(ulong)) [in bits] */
#endif
#endif
/* The following gives a speed/time tradeoff: how many buckets are *
* in each bin. 0 gives 32 buckets/bin, which is a good number. */
#ifndef LOG_BM_WORDS
#define LOG_BM_WORDS 0 /* each group has 2^L_B_W * 32 buckets */
#endif
/* The following are all parameters that affect performance. */
#ifndef JUMP
#define JUMP(key, offset) ( ++(offset) ) /* ( 1 ) for linear hashing */
#endif
#ifndef Table
#define Table(x) Sparse##x /* Dense##x for dense tables */
#endif
#ifndef FAST_DELETE
#define FAST_DELETE 0 /* if it's 1, we never shrink the ht */
#endif
#ifndef SAMEKEY_OVERWRITE
#define SAMEKEY_OVERWRITE 1 /* overwrite item with our key on insert? */
#endif
#ifndef OCCUPANCY_PCT
#define OCCUPANCY_PCT 0.5 /* large PCT means smaller and slower */
#endif
#ifndef MIN_HASH_SIZE
#define MIN_HASH_SIZE 512 /* ht size when first created */
#endif
/* When deleting a bucket, we can't just empty it (future hashes *
* may fail); instead we set the data field to DELETED. Thus you *
* should set DELETED to a data value you never use. Better yet, *
* if you don't need to delete, define INSERT_ONLY. */
#ifndef INSERT_ONLY
#define DELETED -2UL
#define IS_BCK_DELETED(bck) ( (bck) && (bck)->data == DELETED )
#define SET_BCK_DELETED(ht, bck) do { (bck)->data = DELETED; \
FREE_KEY(ht, (bck)->key); } while ( 0 )
#else
#define IS_BCK_DELETED(bck) 0
#define SET_BCK_DELETED(ht, bck) \
do { fprintf(stderr, "Deletion not supported for insert-only hashtable\n");\
exit(2); } while ( 0 )
#endif
/* We need the following only for dense buckets (Dense##x above). *
* If you need to, set this to a value you'll never use for data. */
#define EMPTY -3UL /* steal more of the bck->data space */
/* This is what an item is. Either can be cast to a pointer. */
typedef struct {
ulong data; /* 4 bytes for data: either a pointer or an integer */
ulong key; /* 4 bytes for the key: either a pointer or an int */
} HTItem;
struct Table(Bin); /* defined in chash.c, I hope */
struct Table(Iterator);
typedef struct Table(Bin) Table; /* Expands to SparseBin, etc */
typedef struct Table(Iterator) TableIterator;
/* for STORES_PTR to work ok, cchKey MUST BE DEFINED 1st, cItems 2nd! */
typedef struct HashTable {
ulong cchKey; /* the length of the key, or if it's \0 terminated */
ulong cItems; /* number of items currently in the hashtable */
ulong cDeletedItems; /* # of buckets holding DELETE in the hashtable */
ulong cBuckets; /* size of the table */
Table *table; /* The actual contents of the hashtable */
int fSaveKeys; /* 1 if we copy keys locally; 2 if keys in one block */
int cDeltaGoalSize; /* # of coming inserts (or deletes, if <0) we expect */
HTItem *posLastFind; /* position of last Find() command */
TableIterator *iter; /* used in First/NextBucket */
FILE *fpData; /* if non-NULL, what item->data points into */
char * (*dataRead)(FILE *, int); /* how to load data from disk */
HTItem bckData; /* holds data after being loaded from disk */
} HashTable;
/* Small keys are stored and passed directly, but large keys are
* stored and passed as pointers. To make it easier to remember
* what to pass, we provide two functions:
* PTR_KEY: give it a pointer to your data, and it returns
* something appropriate to send to Hash() functions or
* be stored in a data field.
* KEY_PTR: give it something returned by a Hash() routine, and
* it returns a (char *) pointer to the actual data.
*/
#define HashKeySize(ht) ( ((ulong *)(ht))[0] ) /* this is how we inline */
#define HashSize(ht) ( ((ulong *)(ht))[1] ) /* ...a la C++ :-) */
#define STORES_PTR(ht) ( HashKeySize(ht) == 0 || \
HashKeySize(ht) > sizeof(ulong) )
#define KEY_PTR(ht, key) ( STORES_PTR(ht) ? (char *)(key) : (char *)&(key) )
#ifdef DONT_HAVE_TO_WORRY_ABOUT_BUS_ERRORS
#define PTR_KEY(ht, ptr) ( STORES_PTR(ht) ? (ulong)(ptr) : *(ulong *)(ptr) )
#else
#define PTR_KEY(ht, ptr) ( STORES_PTR(ht) ? (ulong)(ptr) : HTcopy((char *)ptr))
#endif
/* Function prototypes */
unsigned long HTcopy(char *pul); /* for PTR_KEY, not for users */
struct HashTable *AllocateHashTable(int cchKey, int fSaveKeys);
void ClearHashTable(struct HashTable *ht);
void FreeHashTable(struct HashTable *ht);
HTItem *HashFind(struct HashTable *ht, ulong key);
HTItem *HashFindLast(struct HashTable *ht);
HTItem *HashFindOrInsert(struct HashTable *ht, ulong key, ulong dataInsert);
HTItem *HashFindOrInsertItem(struct HashTable *ht, HTItem *pItem);
HTItem *HashInsert(struct HashTable *ht, ulong key, ulong data);
HTItem *HashInsertItem(struct HashTable *ht, HTItem *pItem);
int HashDelete(struct HashTable *ht, ulong key);
int HashDeleteLast(struct HashTable *ht);
HTItem *HashFirstBucket(struct HashTable *ht);
HTItem *HashNextBucket(struct HashTable *ht);
int HashSetDeltaGoalSize(struct HashTable *ht, int delta);
void HashSave(FILE *fp, struct HashTable *ht, int (*write)(FILE *, char *));
struct HashTable *HashLoad(FILE *fp, char * (*read)(FILE *, int));
struct HashTable *HashLoadKeys(FILE *fp, char * (*read)(FILE *, int));

View File

@ -28,6 +28,7 @@
- IP Geolocation by <a href='https://db-ip.com'>DB-IP</a>\n\n
- AppIntro: <a href='https://github.com/AppIntro/AppIntro/blob/main/LICENSE'>Apache-2.0</a>\n\n
- QrGenerator: <a href='https://github.com/androidmads/QRGenerator/blob/master/LICENSE.md'>MIT</a>\n\n
- Sparsehash: <a href='https://github.com/sparsehash/sparsehash/blob/master/COPYING'>BSD-3-Clause</a>\n\n
- CIDRUtils: <a href='https://github.com/edazdarevic/CIDRUtils'>MIT</a>\n\n
- Font Awesome: <a href='https://fontawesome.com/license/free'>Licenses</a>\n\n
- App icon by <a href="https://www.freepik.com" title="Freepik">Freepik</a> from <a href="https://www.flaticon.com/" title="Flaticon">flaticon</a>\n\n