From 6a46c1c208d5799f15b866555becae0b074f9ebe Mon Sep 17 00:00:00 2001 From: Mattias Andrée Date: Thu, 4 Jun 2026 21:09:20 +0200 Subject: Second commit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mattias Andrée --- blkseekr.c | 295 ++++++++++++++++++++++++++++++++++++++++++++----------------- config.mk | 2 +- 2 files changed, 217 insertions(+), 80 deletions(-) diff --git a/blkseekr.c b/blkseekr.c index 0f1c199..8e42289 100644 --- a/blkseekr.c +++ b/blkseekr.c @@ -2,101 +2,204 @@ #include #include #include +#include -USAGE("[-b block-size] [-h] ([-i | +i | -n | +n] ... (-r basic-regex | -e extended-regex | -x text)) ... " +USAGE("[-b block-size] [-B buffer-size] [-h] " + "([-i | +i | -n | +n] ... (-r basic-regex | -e extended-regex | -x text)) ... " "file [first-block [last-block]]"); /* * TODO on SIGINT stop and print last completed block - * TODO use madvise, posix_advise or readahead - * TODO allocate addition blocks to reduce memory copying + * TODO use madvise, posix_advise or readahead (?) * TODO show progress * TODO make program that extracts (binary) or displays (hex + text) the blocks * TODO add support for NUL support in regex + * TODO add support for hexadecimally encoded text (-X) */ -struct memstring { - const char *text; +enum automata_type { + FIXED, + FIXED_ICASE, + REGEX +}; + +struct fixed { + LIBAUTOMATA_KMP_AUTOMATON *kmp; size_t len; - int icase; }; +struct automata { + enum automata_type type; + union { + struct fixed fixed; + regex_t regex; + } a; +}; -static struct memstring *texts = NULL; -static size_t ntexts = 0; -static regex_t *automata = NULL; -static size_t nautomata = 0; + +static struct automata *automata = NULL; +static size_t nautomata = 0u; static int analyse_holes = 0; static off_t position; +static size_t blksize = 4ul << 10; +static size_t bufsize = 8ul << 20; +static char *lower_text = NULL; +static size_t lower_text_size = 0u; + + +static void +report(off_t beginning) +{ + printf("Found expression beginning at byte %ju\n", (uintmax_t)beginning); + fflush(stdout); +} static void -analyse(char *block, size_t blksize, size_t available) /* TODO */ +analyse_block_with_fixed(char *block, size_t start_end, size_t available, struct fixed *automaton) { + size_t extra, offset = 0; + char *end, *beginning; + + extra = MIN(available - start_end, automaton->len); + + while (offset < start_end) { + end = libautomata_execute_kmp_automaton(automaton->kmp, &block[offset], start_end - offset + extra); + if (!end) + break; + beginning = end - automaton->len; + offset = (size_t)(beginning - block); + report(position + (off_t)offset); + offset += 1u; + } +} + + +static void +analyse_block_with_regex(char *block, size_t start_end, size_t available, regex_t *automaton) +{ + /* TODO */ +} + + +static void +analyse_block(char *block, size_t start_end, size_t available) +{ + size_t i, j; + int lowered = 0; + for (i = 0u; i < nautomata; i++) { + switch (automata[i].type) { + case FIXED_ICASE: + if (!lowered) { + lowered = 1; + if (available > lower_text_size) + lower_text = erealloc(lower_text, lower_text_size = available); + for (j = 0u; j < available; j++) { + if ('A' <= block[i] && block[i] <= 'Z') + lower_text[i] = (char)(block[i] ^ ('a' ^ 'A')); + else + lower_text[i] = block[i]; + } + } + analyse_block_with_fixed(lower_text, start_end, available, &automata[i].a.fixed); + break; + case FIXED: + analyse_block_with_fixed(block, start_end, available, &automata[i].a.fixed); + break; + case REGEX: + analyse_block_with_regex(block, start_end, available, &automata[i].a.regex); + break; + default: + abort(); + } + } } static size_t -encountered_data(char *block, size_t blksize, size_t available) +encountered_data(char *text, size_t available) { - if (available < blksize * 2u) + /* Wait until the buffer is full */ + if (available < bufsize) return available; - analyse(block, blksize, available); + /* Process the buffer */ + analyse_block(text, bufsize - blksize, available); + position += (off_t)(bufsize - blksize); - memcpy(&block[0], &block[blksize], blksize); - position += (off_t)blksize; + /* Save last block to support patterns spanning two blocks */ + memcpy(&text[0], &text[bufsize - blksize], blksize); return blksize; } static size_t -encountered_hole(char *block, size_t blksize, size_t available, off_t size) +encountered_hole(char *text, size_t available, off_t size) { - off_t q = size / (off_t)blksize; - off_t r = size % (off_t)blksize; - size_t n; - - if (q) { - q--; - r += (off_t)blksize; + off_t r; + + /* If there are at least two blocks of data buffered, + * process all but the last one (the last one have to + * be skipped the processing requires two blocks) */ + if (available >= blksize * 2u) { + size_t full_avail = available - (available % blksize); + size_t start_end = available - blksize; + analyse_block(text, start_end, full_avail); + position += (off_t)start_end; + available -= start_end; + memcpy(&text[0], &text[start_end], available - start_end); } - while (r || q) { - n = blksize * 2u - available; - n = MIN(n, (size_t)r); - memset(&block[available], 0, n); + /* If there is data buffered remaining, fill to two + * blocks with null bytes and analyse that data */ + if (available) { + size_t n = blksize * 2u - available; + if ((uintmax_t)n > (uintmax_t)size) + n = (size_t)size; + memset(&text[available], 0, n); available += n; - r -= (off_t)n; + if (available < blksize * 2u) + return available; + analyse_block(text, available - blksize, available); + position += (off_t)(available - blksize); + size -= (off_t)n; + available = blksize; + } - available = encountered_data(block, blksize, available); - if (q) { - q--; - memset(&block[available], 0, blksize); - available = encountered_data(block, blksize, available); - } - if (q) { - q--; - memset(&block[available], 0, blksize); - available = encountered_data(block, blksize, available); - } - if (analyse_holes) { - for (; q; q--) - available = encountered_data(block, blksize, available); - } + memset(text, 0, bufsize); + size += (off_t)available; + + /* If we are not analyzing holes, skip pass the full + * blocks and the full the buffer with as many null + * bytes as there are bytes in the hole in excess of + * full blocks */ + if (!analyse_holes) { + r = size % (off_t)blksize; + position += size - r; + return (size_t)r; } - return available; + /* But if we are analysing holes... */ + while (size >= (off_t)(blksize * 2u)) { + size_t max = (size_t)MIN((uintmax_t)size, (uintmax_t)bufsize); + size_t full_avail = max - (max % blksize); + size_t start_end = max - blksize; + analyse_block(text, start_end, full_avail); + position += (off_t)start_end; + size -= (off_t)start_end; + } + + return (size_t)size; } static void -encountered_eof(char *block, size_t blksize, size_t available) +encountered_eof(char *text, size_t available) { - analyse(block, blksize, available); + analyse_block(text, available, available); } @@ -113,7 +216,7 @@ eregcomp(regex_t *restrict preg, const char *restrict regex, int cflags) regerror_again: r = regerror(e, preg, buf, buf_size); if (r > buf_size) { - buf = buf_dynamic = erealloc(buf_dynamic, buf_size *= 2); + buf = buf_dynamic = erealloc(buf_dynamic, buf_size *= 2u); goto regerror_again; } eprintf("regcomp %s: %s", regex, buf); @@ -126,33 +229,51 @@ int main(int argc, char *argv[]) { const char *path; - char *block; + const char *pattern; + char *pattern_free; + char *text; size_t available = 0u; int regex_flags = REG_NOSUB; int have_unused_flags = 0; - size_t blksize = 4ul << 10; off_t first_block = 0; off_t last_block = -1; off_t data, hole; int skip_holes = 1; int fd; struct stat st; + size_t i; ARGBEGIN { case 'r': case 'e': have_unused_flags = 0; automata = ereallocarray(automata, nautomata + 1u, sizeof(*automata)); - eregcomp(&automata[nautomata++], ARG(), regex_flags | (FLAG() == 'e' ? REG_EXTENDED : 0)); + eregcomp(&automata[nautomata].a.regex, ARG(), regex_flags | (FLAG() == 'e' ? REG_EXTENDED : 0)); + automata[nautomata].type = REGEX; + nautomata++; break; case 'x': have_unused_flags = 0; - texts = ereallocarray(texts, ntexts + 1u, sizeof(*texts)); - texts[ntexts].text = ARG(); - texts[ntexts].len = strlen(texts[ntexts].text); - texts[ntexts].icase = !!(regex_flags & REG_ICASE); - ntexts++; + automata = ereallocarray(automata, nautomata + 1u, sizeof(*automata)); + pattern = ARG(); + if (regex_flags & REG_ICASE) { + automata[nautomata].type = FIXED_ICASE; + pattern_free = estrdup(pattern); + for (i = 0u; pattern_free[i]; i++) + if ('A' <= pattern_free[i] && pattern_free[i] <= 'Z') + pattern_free[i] ^= (char)('a' ^ 'A'); + pattern = pattern_free; + } else { + automata[nautomata].type = FIXED; + pattern_free = NULL; + } + automata[nautomata].a.fixed.len = strlen(pattern); + automata[nautomata].a.fixed.kmp = libautomata_compile_kmp_automaton(pattern, strlen(pattern), sizeof(char)); + if (!automata[nautomata].a.fixed.kmp) + eprintf("libautomata_compile_kmp_automaton:"); + nautomata++; + free(pattern_free); break; case 'h': @@ -160,16 +281,20 @@ main(int argc, char *argv[]) break; case 'b': - /* TODO set `blksize` */ + /* TODO set blksize */ break; - case 'i': regex_flags |= REG_ICASE; have_unused_flags = 1; break; - case 'n': regex_flags |= REG_NEWLINE; have_unused_flags = 1; break; + case 'B': + /* TODO set bufsize */ + break; + + case 'i': regex_flags |= REG_ICASE; have_unused_flags = 1; break; + case 'n': regex_flags |= REG_NEWLINE; have_unused_flags = 1; break; default: usage(); } ARGALT('+') { - case 'i': regex_flags &= ~REG_ICASE; have_unused_flags = 1; break; - case 'n': regex_flags &= ~REG_NEWLINE; have_unused_flags = 1; break; + case 'i': regex_flags &= ~REG_ICASE; have_unused_flags = 1; break; + case 'n': regex_flags &= ~REG_NEWLINE; have_unused_flags = 1; break; default: usage(); } ARGEND; @@ -179,16 +304,24 @@ main(int argc, char *argv[]) path = argv[0]; - if (argc >= 2) - ; /* TODO set first_block */ + if (argc >= 2) { + /* TODO set first_block */ + } if (argc >= 3) { - ; /* TODO set last_block */ + /* TODO set last_block */ if (last_block < first_block) usage(); } - block = emalloc(blksize * 2u); + if (blksize > (size_t)(SSIZE_MAX / 4)) + eprintf("selected blocksize is too large"); + if (bufsize > (size_t)SSIZE_MAX) + bufsize = (size_t)SSIZE_MAX; + bufsize = bufsize / blksize; + bufsize = MAX(bufsize, 2); + bufsize *= blksize; + text = emalloc(bufsize); first_block *= (off_t)blksize; if (last_block >= 0) { last_block += 1; @@ -210,8 +343,8 @@ main(int argc, char *argv[]) if (first_block && lseek(fd, first_block, SEEK_SET) == (off_t)-1) { off_t remaining = first_block; while (remaining) { - uintmax_t n = MIN((uintmax_t)remaining, (uintmax_t)(blksize * 2u)); - ssize_t r = read(fd, block, (size_t)n); + uintmax_t n = MIN((uintmax_t)remaining, (uintmax_t)bufsize); + ssize_t r = read(fd, text, (size_t)n); if (r <= 0) { if (!r) goto out; @@ -236,7 +369,7 @@ main(int argc, char *argv[]) else eprintf("lseek %s %ji SEEK_DATA", path, (intmax_t)first_block); } else { - available = encountered_hole(block, blksize, available, data - first_block); + available = encountered_hole(text, available, data - first_block); } } else { data = first_block; @@ -263,9 +396,9 @@ main(int argc, char *argv[]) /* Analyse the data */ while (data < hole) { off_t n_off = hole - data; - size_t n_size = blksize * 2u - available; + size_t n_size = bufsize - available; size_t n = (uintmax_t)n_off > (uintmax_t)SSIZE_MAX ? n_size : MIN((size_t)n_off, n_size); - ssize_t r = read(fd, &block[available], n); + ssize_t r = read(fd, &text[available], n); if (r <= 0) { if (!r) goto eof; @@ -274,24 +407,28 @@ main(int argc, char *argv[]) eprintf("read %s:", path); } available += (size_t)r; - if (available == blksize * 2u) - available = encountered_data(block, blksize, available); + if (available == bufsize) + available = encountered_data(text, available); } } eof: /* Analyse end of file */ - available = encountered_data(block, blksize, available); + available = encountered_data(text, available); if (skip_holes && first_block < st.st_size) - available = encountered_hole(block, blksize, available, st.st_size - first_block); - encountered_eof(block, blksize, available); + available = encountered_hole(text, available, st.st_size - first_block); + encountered_eof(text, available); out: close(fd); - while (nautomata) - regfree(&automata[--nautomata]); + while (nautomata--) { + if (automata[nautomata].type == REGEX) + regfree(&automata[nautomata].a.regex); + else + free(automata[nautomata].a.fixed.kmp); + } free(automata); - free(texts); - free(block); + free(text); + free(lower_text); return 0; } diff --git a/config.mk b/config.mk index aeb8cec..ee7fff4 100644 --- a/config.mk +++ b/config.mk @@ -5,4 +5,4 @@ CC = c99 CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -D_GNU_SOURCE CFLAGS = -LDFLAGS = -lsimple +LDFLAGS = -lsimple -lautomata -- cgit v1.3.1