summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMattias Andrée <m@maandree.se>2026-06-04 21:09:20 +0200
committerMattias Andrée <m@maandree.se>2026-06-04 21:12:18 +0200
commit6a46c1c208d5799f15b866555becae0b074f9ebe (patch)
treee49d2136ed33e72d6300f013bc482ab9df37ccc9
parentFirst commit (diff)
downloadblkseekr-6a46c1c208d5799f15b866555becae0b074f9ebe.tar.gz
blkseekr-6a46c1c208d5799f15b866555becae0b074f9ebe.tar.bz2
blkseekr-6a46c1c208d5799f15b866555becae0b074f9ebe.tar.xz
Second commit
Signed-off-by: Mattias Andrée <m@maandree.se>
-rw-r--r--blkseekr.c293
-rw-r--r--config.mk2
2 files changed, 216 insertions, 79 deletions
diff --git a/blkseekr.c b/blkseekr.c
index 0f1c199..8e42289 100644
--- a/blkseekr.c
+++ b/blkseekr.c
@@ -2,101 +2,204 @@
#include <libsimple-arg.h>
#include <libsimple.h>
#include <regex.h>
+#include <libautomata.h>
-USAGE("[-b block-size] [-h] ([-i | +i | -n | +n] ... (-r basic-regex | -e extended-regex | -x text)) ... "
+USAGE("[-b block-size] [-B buffer-size] [-h] "
+ "([-i | +i | -n | +n] ... (-r basic-regex | -e extended-regex | -x text)) ... "
"file [first-block [last-block]]");
/*
* TODO on SIGINT stop and print last completed block
- * TODO use madvise, posix_advise or readahead
- * TODO allocate addition blocks to reduce memory copying
+ * TODO use madvise, posix_advise or readahead (?)
* TODO show progress
* TODO make program that extracts (binary) or displays (hex + text) the blocks
* TODO add support for NUL support in regex
+ * TODO add support for hexadecimally encoded text (-X)
*/
-struct memstring {
- const char *text;
+enum automata_type {
+ FIXED,
+ FIXED_ICASE,
+ REGEX
+};
+
+struct fixed {
+ LIBAUTOMATA_KMP_AUTOMATON *kmp;
size_t len;
- int icase;
};
+struct automata {
+ enum automata_type type;
+ union {
+ struct fixed fixed;
+ regex_t regex;
+ } a;
+};
-static struct memstring *texts = NULL;
-static size_t ntexts = 0;
-static regex_t *automata = NULL;
-static size_t nautomata = 0;
+
+static struct automata *automata = NULL;
+static size_t nautomata = 0u;
static int analyse_holes = 0;
static off_t position;
+static size_t blksize = 4ul << 10;
+static size_t bufsize = 8ul << 20;
+static char *lower_text = NULL;
+static size_t lower_text_size = 0u;
+
+
+static void
+report(off_t beginning)
+{
+ printf("Found expression beginning at byte %ju\n", (uintmax_t)beginning);
+ fflush(stdout);
+}
+
+
+static void
+analyse_block_with_fixed(char *block, size_t start_end, size_t available, struct fixed *automaton)
+{
+ size_t extra, offset = 0;
+ char *end, *beginning;
+
+ extra = MIN(available - start_end, automaton->len);
+
+ while (offset < start_end) {
+ end = libautomata_execute_kmp_automaton(automaton->kmp, &block[offset], start_end - offset + extra);
+ if (!end)
+ break;
+ beginning = end - automaton->len;
+ offset = (size_t)(beginning - block);
+ report(position + (off_t)offset);
+ offset += 1u;
+ }
+}
static void
-analyse(char *block, size_t blksize, size_t available) /* TODO */
+analyse_block_with_regex(char *block, size_t start_end, size_t available, regex_t *automaton)
{
+ /* TODO */
+}
+
+
+static void
+analyse_block(char *block, size_t start_end, size_t available)
+{
+ size_t i, j;
+ int lowered = 0;
+ for (i = 0u; i < nautomata; i++) {
+ switch (automata[i].type) {
+ case FIXED_ICASE:
+ if (!lowered) {
+ lowered = 1;
+ if (available > lower_text_size)
+ lower_text = erealloc(lower_text, lower_text_size = available);
+ for (j = 0u; j < available; j++) {
+ if ('A' <= block[i] && block[i] <= 'Z')
+ lower_text[i] = (char)(block[i] ^ ('a' ^ 'A'));
+ else
+ lower_text[i] = block[i];
+ }
+ }
+ analyse_block_with_fixed(lower_text, start_end, available, &automata[i].a.fixed);
+ break;
+ case FIXED:
+ analyse_block_with_fixed(block, start_end, available, &automata[i].a.fixed);
+ break;
+ case REGEX:
+ analyse_block_with_regex(block, start_end, available, &automata[i].a.regex);
+ break;
+ default:
+ abort();
+ }
+ }
}
static size_t
-encountered_data(char *block, size_t blksize, size_t available)
+encountered_data(char *text, size_t available)
{
- if (available < blksize * 2u)
+ /* Wait until the buffer is full */
+ if (available < bufsize)
return available;
- analyse(block, blksize, available);
+ /* Process the buffer */
+ analyse_block(text, bufsize - blksize, available);
+ position += (off_t)(bufsize - blksize);
- memcpy(&block[0], &block[blksize], blksize);
- position += (off_t)blksize;
+ /* Save last block to support patterns spanning two blocks */
+ memcpy(&text[0], &text[bufsize - blksize], blksize);
return blksize;
}
static size_t
-encountered_hole(char *block, size_t blksize, size_t available, off_t size)
+encountered_hole(char *text, size_t available, off_t size)
{
- off_t q = size / (off_t)blksize;
- off_t r = size % (off_t)blksize;
- size_t n;
+ off_t r;
- if (q) {
- q--;
- r += (off_t)blksize;
+ /* If there are at least two blocks of data buffered,
+ * process all but the last one (the last one have to
+ * be skipped the processing requires two blocks) */
+ if (available >= blksize * 2u) {
+ size_t full_avail = available - (available % blksize);
+ size_t start_end = available - blksize;
+ analyse_block(text, start_end, full_avail);
+ position += (off_t)start_end;
+ available -= start_end;
+ memcpy(&text[0], &text[start_end], available - start_end);
}
- while (r || q) {
- n = blksize * 2u - available;
- n = MIN(n, (size_t)r);
- memset(&block[available], 0, n);
+ /* If there is data buffered remaining, fill to two
+ * blocks with null bytes and analyse that data */
+ if (available) {
+ size_t n = blksize * 2u - available;
+ if ((uintmax_t)n > (uintmax_t)size)
+ n = (size_t)size;
+ memset(&text[available], 0, n);
available += n;
- r -= (off_t)n;
+ if (available < blksize * 2u)
+ return available;
+ analyse_block(text, available - blksize, available);
+ position += (off_t)(available - blksize);
+ size -= (off_t)n;
+ available = blksize;
+ }
- available = encountered_data(block, blksize, available);
- if (q) {
- q--;
- memset(&block[available], 0, blksize);
- available = encountered_data(block, blksize, available);
- }
- if (q) {
- q--;
- memset(&block[available], 0, blksize);
- available = encountered_data(block, blksize, available);
- }
- if (analyse_holes) {
- for (; q; q--)
- available = encountered_data(block, blksize, available);
- }
+ memset(text, 0, bufsize);
+ size += (off_t)available;
+
+ /* If we are not analyzing holes, skip pass the full
+ * blocks and the full the buffer with as many null
+ * bytes as there are bytes in the hole in excess of
+ * full blocks */
+ if (!analyse_holes) {
+ r = size % (off_t)blksize;
+ position += size - r;
+ return (size_t)r;
}
- return available;
+ /* But if we are analysing holes... */
+ while (size >= (off_t)(blksize * 2u)) {
+ size_t max = (size_t)MIN((uintmax_t)size, (uintmax_t)bufsize);
+ size_t full_avail = max - (max % blksize);
+ size_t start_end = max - blksize;
+ analyse_block(text, start_end, full_avail);
+ position += (off_t)start_end;
+ size -= (off_t)start_end;
+ }
+
+ return (size_t)size;
}
static void
-encountered_eof(char *block, size_t blksize, size_t available)
+encountered_eof(char *text, size_t available)
{
- analyse(block, blksize, available);
+ analyse_block(text, available, available);
}
@@ -113,7 +216,7 @@ eregcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
regerror_again:
r = regerror(e, preg, buf, buf_size);
if (r > buf_size) {
- buf = buf_dynamic = erealloc(buf_dynamic, buf_size *= 2);
+ buf = buf_dynamic = erealloc(buf_dynamic, buf_size *= 2u);
goto regerror_again;
}
eprintf("regcomp %s: %s", regex, buf);
@@ -126,33 +229,51 @@ int
main(int argc, char *argv[])
{
const char *path;
- char *block;
+ const char *pattern;
+ char *pattern_free;
+ char *text;
size_t available = 0u;
int regex_flags = REG_NOSUB;
int have_unused_flags = 0;
- size_t blksize = 4ul << 10;
off_t first_block = 0;
off_t last_block = -1;
off_t data, hole;
int skip_holes = 1;
int fd;
struct stat st;
+ size_t i;
ARGBEGIN {
case 'r':
case 'e':
have_unused_flags = 0;
automata = ereallocarray(automata, nautomata + 1u, sizeof(*automata));
- eregcomp(&automata[nautomata++], ARG(), regex_flags | (FLAG() == 'e' ? REG_EXTENDED : 0));
+ eregcomp(&automata[nautomata].a.regex, ARG(), regex_flags | (FLAG() == 'e' ? REG_EXTENDED : 0));
+ automata[nautomata].type = REGEX;
+ nautomata++;
break;
case 'x':
have_unused_flags = 0;
- texts = ereallocarray(texts, ntexts + 1u, sizeof(*texts));
- texts[ntexts].text = ARG();
- texts[ntexts].len = strlen(texts[ntexts].text);
- texts[ntexts].icase = !!(regex_flags & REG_ICASE);
- ntexts++;
+ automata = ereallocarray(automata, nautomata + 1u, sizeof(*automata));
+ pattern = ARG();
+ if (regex_flags & REG_ICASE) {
+ automata[nautomata].type = FIXED_ICASE;
+ pattern_free = estrdup(pattern);
+ for (i = 0u; pattern_free[i]; i++)
+ if ('A' <= pattern_free[i] && pattern_free[i] <= 'Z')
+ pattern_free[i] ^= (char)('a' ^ 'A');
+ pattern = pattern_free;
+ } else {
+ automata[nautomata].type = FIXED;
+ pattern_free = NULL;
+ }
+ automata[nautomata].a.fixed.len = strlen(pattern);
+ automata[nautomata].a.fixed.kmp = libautomata_compile_kmp_automaton(pattern, strlen(pattern), sizeof(char));
+ if (!automata[nautomata].a.fixed.kmp)
+ eprintf("libautomata_compile_kmp_automaton:");
+ nautomata++;
+ free(pattern_free);
break;
case 'h':
@@ -160,16 +281,20 @@ main(int argc, char *argv[])
break;
case 'b':
- /* TODO set `blksize` */
+ /* TODO set blksize */
break;
- case 'i': regex_flags |= REG_ICASE; have_unused_flags = 1; break;
- case 'n': regex_flags |= REG_NEWLINE; have_unused_flags = 1; break;
+ case 'B':
+ /* TODO set bufsize */
+ break;
+
+ case 'i': regex_flags |= REG_ICASE; have_unused_flags = 1; break;
+ case 'n': regex_flags |= REG_NEWLINE; have_unused_flags = 1; break;
default:
usage();
} ARGALT('+') {
- case 'i': regex_flags &= ~REG_ICASE; have_unused_flags = 1; break;
- case 'n': regex_flags &= ~REG_NEWLINE; have_unused_flags = 1; break;
+ case 'i': regex_flags &= ~REG_ICASE; have_unused_flags = 1; break;
+ case 'n': regex_flags &= ~REG_NEWLINE; have_unused_flags = 1; break;
default:
usage();
} ARGEND;
@@ -179,16 +304,24 @@ main(int argc, char *argv[])
path = argv[0];
- if (argc >= 2)
- ; /* TODO set first_block */
+ if (argc >= 2) {
+ /* TODO set first_block */
+ }
if (argc >= 3) {
- ; /* TODO set last_block */
+ /* TODO set last_block */
if (last_block < first_block)
usage();
}
- block = emalloc(blksize * 2u);
+ if (blksize > (size_t)(SSIZE_MAX / 4))
+ eprintf("selected blocksize is too large");
+ if (bufsize > (size_t)SSIZE_MAX)
+ bufsize = (size_t)SSIZE_MAX;
+ bufsize = bufsize / blksize;
+ bufsize = MAX(bufsize, 2);
+ bufsize *= blksize;
+ text = emalloc(bufsize);
first_block *= (off_t)blksize;
if (last_block >= 0) {
last_block += 1;
@@ -210,8 +343,8 @@ main(int argc, char *argv[])
if (first_block && lseek(fd, first_block, SEEK_SET) == (off_t)-1) {
off_t remaining = first_block;
while (remaining) {
- uintmax_t n = MIN((uintmax_t)remaining, (uintmax_t)(blksize * 2u));
- ssize_t r = read(fd, block, (size_t)n);
+ uintmax_t n = MIN((uintmax_t)remaining, (uintmax_t)bufsize);
+ ssize_t r = read(fd, text, (size_t)n);
if (r <= 0) {
if (!r)
goto out;
@@ -236,7 +369,7 @@ main(int argc, char *argv[])
else
eprintf("lseek %s %ji SEEK_DATA", path, (intmax_t)first_block);
} else {
- available = encountered_hole(block, blksize, available, data - first_block);
+ available = encountered_hole(text, available, data - first_block);
}
} else {
data = first_block;
@@ -263,9 +396,9 @@ main(int argc, char *argv[])
/* Analyse the data */
while (data < hole) {
off_t n_off = hole - data;
- size_t n_size = blksize * 2u - available;
+ size_t n_size = bufsize - available;
size_t n = (uintmax_t)n_off > (uintmax_t)SSIZE_MAX ? n_size : MIN((size_t)n_off, n_size);
- ssize_t r = read(fd, &block[available], n);
+ ssize_t r = read(fd, &text[available], n);
if (r <= 0) {
if (!r)
goto eof;
@@ -274,24 +407,28 @@ main(int argc, char *argv[])
eprintf("read %s:", path);
}
available += (size_t)r;
- if (available == blksize * 2u)
- available = encountered_data(block, blksize, available);
+ if (available == bufsize)
+ available = encountered_data(text, available);
}
}
eof:
/* Analyse end of file */
- available = encountered_data(block, blksize, available);
+ available = encountered_data(text, available);
if (skip_holes && first_block < st.st_size)
- available = encountered_hole(block, blksize, available, st.st_size - first_block);
- encountered_eof(block, blksize, available);
+ available = encountered_hole(text, available, st.st_size - first_block);
+ encountered_eof(text, available);
out:
close(fd);
- while (nautomata)
- regfree(&automata[--nautomata]);
+ while (nautomata--) {
+ if (automata[nautomata].type == REGEX)
+ regfree(&automata[nautomata].a.regex);
+ else
+ free(automata[nautomata].a.fixed.kmp);
+ }
free(automata);
- free(texts);
- free(block);
+ free(text);
+ free(lower_text);
return 0;
}
diff --git a/config.mk b/config.mk
index aeb8cec..ee7fff4 100644
--- a/config.mk
+++ b/config.mk
@@ -5,4 +5,4 @@ CC = c99
CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
CFLAGS =
-LDFLAGS = -lsimple
+LDFLAGS = -lsimple -lautomata