diff options
| author | Mattias Andrée <m@maandree.se> | 2026-06-04 22:42:16 +0200 |
|---|---|---|
| committer | Mattias Andrée <m@maandree.se> | 2026-06-04 22:42:16 +0200 |
| commit | 19cc5d523e5a9feeab5a36ce779ddbfda0717b5a (patch) | |
| tree | 72c3b1904fd1626f802d7c8c93f8cfa2eff4fcae /blkseekr.c | |
| parent | Second commit (diff) | |
| download | blkseekr-19cc5d523e5a9feeab5a36ce779ddbfda0717b5a.tar.gz blkseekr-19cc5d523e5a9feeab5a36ce779ddbfda0717b5a.tar.bz2 blkseekr-19cc5d523e5a9feeab5a36ce779ddbfda0717b5a.tar.xz | |
Third commit
Signed-off-by: Mattias Andrée <m@maandree.se>
Diffstat (limited to 'blkseekr.c')
| -rw-r--r-- | blkseekr.c | 221 |
1 files changed, 169 insertions, 52 deletions
@@ -5,17 +5,15 @@ #include <libautomata.h> USAGE("[-b block-size] [-B buffer-size] [-h] " - "([-i | +i | -n | +n] ... (-r basic-regex | -e extended-regex | -x text)) ... " + "([-i | +i | -n | +n] ... (-r basic-regex | -e extended-regex | -x text | -X hextext)) ... " "file [first-block [last-block]]"); /* * TODO on SIGINT stop and print last completed block - * TODO use madvise, posix_advise or readahead (?) * TODO show progress * TODO make program that extracts (binary) or displays (hex + text) the blocks - * TODO add support for NUL support in regex - * TODO add support for hexadecimally encoded text (-X) + * TODO add support for NUL support in regex (once supported in libautomata) */ @@ -47,13 +45,21 @@ static size_t blksize = 4ul << 10; static size_t bufsize = 8ul << 20; static char *lower_text = NULL; static size_t lower_text_size = 0u; +static char *blkmarks; +static size_t nblkmarks; +static size_t low_mark; +static size_t high_mark; static void -report(off_t beginning) +mark(off_t beginning) { - printf("Found expression beginning at byte %ju\n", (uintmax_t)beginning); - fflush(stdout); + size_t mark = (size_t)(beginning - position) / blksize; + blkmarks[mark] = 1; + if (low_mark > mark) + low_mark = mark; + if (high_mark < mark) + high_mark = mark; } @@ -71,7 +77,7 @@ analyse_block_with_fixed(char *block, size_t start_end, size_t available, struct break; beginning = end - automaton->len; offset = (size_t)(beginning - block); - report(position + (off_t)offset); + mark(position + (off_t)offset); offset += 1u; } } @@ -80,40 +86,68 @@ analyse_block_with_fixed(char *block, size_t start_end, size_t available, struct static void analyse_block_with_regex(char *block, size_t start_end, size_t available, regex_t *automaton) { - /* TODO */ + size_t offset = 0u; + regmatch_t match; + int r; + + while (offset < start_end) { + match.rm_so = (regoff_t)offset; + match.rm_eo = (regoff_t)available; + r = regexec(automaton, block, 1u, &match, REG_STARTEND); + if (r == REG_NOMATCH) + break; + if (r) + eprintf("regexec:"); + if ((size_t)match.rm_so >= start_end) + break; + mark(position + (off_t)match.rm_so); + offset = (size_t)match.rm_so + 1u; + } } static void analyse_block(char *block, size_t start_end, size_t available) { - size_t i, j; + size_t i = 0u, off = 0u, blkoff = 0u; int lowered = 0; - for (i = 0u; i < nautomata; i++) { + + memset(blkmarks, 0, nblkmarks); + low_mark = SIZE_MAX; + high_mark = 0u; + + for (;;) { switch (automata[i].type) { case FIXED_ICASE: if (!lowered) { lowered = 1; - if (available > lower_text_size) - lower_text = erealloc(lower_text, lower_text_size = available); - for (j = 0u; j < available; j++) { - if ('A' <= block[i] && block[i] <= 'Z') - lower_text[i] = (char)(block[i] ^ ('a' ^ 'A')); - else - lower_text[i] = block[i]; - } + if (available - off > lower_text_size) + lower_text = erealloc(lower_text, lower_text_size = available - off); + libsimple_memtolower(&lower_text[off], &block[off], available - off); } - analyse_block_with_fixed(lower_text, start_end, available, &automata[i].a.fixed); + analyse_block_with_fixed(&lower_text[off], start_end - off, available - off, &automata[i].a.fixed); break; case FIXED: - analyse_block_with_fixed(block, start_end, available, &automata[i].a.fixed); + analyse_block_with_fixed(&block[off], start_end - off, available - off, &automata[i].a.fixed); break; case REGEX: - analyse_block_with_regex(block, start_end, available, &automata[i].a.regex); + analyse_block_with_regex(&block[off], start_end - off, available - off, &automata[i].a.regex); break; default: abort(); } + + if (++i == nautomata) + break; + + if (blkoff == low_mark) + for (; blkoff <= high_mark && blkmarks[blkoff]; blkoff++) + off += blksize; + } + + for (i = 0u; i < nblkmarks; i++) { + printf("Found expression beginning in block %ju\n", (uintmax_t)position / blksize + i); + fflush(stdout); } } @@ -146,11 +180,11 @@ encountered_hole(char *text, size_t available, off_t size) * be skipped the processing requires two blocks) */ if (available >= blksize * 2u) { size_t full_avail = available - (available % blksize); - size_t start_end = available - blksize; + size_t start_end = full_avail - blksize; analyse_block(text, start_end, full_avail); position += (off_t)start_end; available -= start_end; - memcpy(&text[0], &text[start_end], available - start_end); + memcpy(&text[0], &text[start_end], available); } /* If there is data buffered remaining, fill to two @@ -186,7 +220,7 @@ encountered_hole(char *text, size_t available, off_t size) while (size >= (off_t)(blksize * 2u)) { size_t max = (size_t)MIN((uintmax_t)size, (uintmax_t)bufsize); size_t full_avail = max - (max % blksize); - size_t start_end = max - blksize; + size_t start_end = full_avail - blksize; analyse_block(text, start_end, full_avail); position += (off_t)start_end; size -= (off_t)start_end; @@ -225,6 +259,50 @@ eregcomp(regex_t *restrict preg, const char *restrict regex, int cflags) } +#if defined(__GNUC__) +__attribute__((__pure__)) +#endif +static uintmax_t +parse_uint_arg(const char *s, uintmax_t min, uintmax_t max) +{ + uintmax_t v = 0u, d; + if (!*s) + usage(); + while ('0' <= *s && *s <= '9') { + d = (uintmax_t)(*s++ - '0'); + if (d > 9u || v > (max - d) / 10u) /* we know that max>d */ + usage(); + v = v * 10u + v; + } + if (*s || v < min) + usage(); + return v; +} + + +static void +decode_hex(char *bin, const char *hex, size_t n) +{ + size_t i, j, end; + uint8_t value; + for (i = 0u, j = 0u; i < n; i += 1u) { + value = 0u; + for (end = j + 2u; j < end; j++) { + value <<= 4; + if ('0' <= hex[j] && hex[j] <= '9') + value |= (uint8_t)(hex[j] - '0'); + else if ('a' <= hex[j] && hex[j] <= 'f') + value |= (uint8_t)(hex[j] - 'a' + 10); + else if ('A' <= hex[j] && hex[j] <= 'F') + value |= (uint8_t)(hex[j] - 'A' + 10); + else + usage(); + } + bin[i] = (char)value; + } +} + + int main(int argc, char *argv[]) { @@ -232,7 +310,7 @@ main(int argc, char *argv[]) const char *pattern; char *pattern_free; char *text; - size_t available = 0u; + size_t len, available = 0u; int regex_flags = REG_NOSUB; int have_unused_flags = 0; off_t first_block = 0; @@ -241,7 +319,6 @@ main(int argc, char *argv[]) int skip_holes = 1; int fd; struct stat st; - size_t i; ARGBEGIN { case 'r': @@ -257,19 +334,42 @@ main(int argc, char *argv[]) have_unused_flags = 0; automata = ereallocarray(automata, nautomata + 1u, sizeof(*automata)); pattern = ARG(); + automata[nautomata].a.fixed.len = len = strlen(pattern); + if (!len) + usage(); if (regex_flags & REG_ICASE) { automata[nautomata].type = FIXED_ICASE; pattern_free = estrdup(pattern); - for (i = 0u; pattern_free[i]; i++) - if ('A' <= pattern_free[i] && pattern_free[i] <= 'Z') - pattern_free[i] ^= (char)('a' ^ 'A'); + libsimple_memtolower(pattern_free, pattern_free, len); pattern = pattern_free; } else { automata[nautomata].type = FIXED; pattern_free = NULL; } - automata[nautomata].a.fixed.len = strlen(pattern); - automata[nautomata].a.fixed.kmp = libautomata_compile_kmp_automaton(pattern, strlen(pattern), sizeof(char)); + automata[nautomata].a.fixed.kmp = libautomata_compile_kmp_automaton(pattern, len, sizeof(char)); + if (!automata[nautomata].a.fixed.kmp) + eprintf("libautomata_compile_kmp_automaton:"); + nautomata++; + free(pattern_free); + break; + + case 'X': + have_unused_flags = 0; + automata = ereallocarray(automata, nautomata + 1u, sizeof(*automata)); + pattern = ARG(); + len = strlen(pattern); + if (!len || (len & 1u)) + usage(); + automata[nautomata].a.fixed.len = len /= 2u; + pattern_free = emalloc(len); + decode_hex(pattern_free, pattern, len); + if (regex_flags & REG_ICASE) { + automata[nautomata].type = FIXED_ICASE; + libsimple_memtolower(pattern_free, pattern_free, len); + } else { + automata[nautomata].type = FIXED; + } + automata[nautomata].a.fixed.kmp = libautomata_compile_kmp_automaton(pattern_free, len, sizeof(char)); if (!automata[nautomata].a.fixed.kmp) eprintf("libautomata_compile_kmp_automaton:"); nautomata++; @@ -281,11 +381,11 @@ main(int argc, char *argv[]) break; case 'b': - /* TODO set blksize */ + blksize = (size_t)parse_uint_arg(ARG(), 1u, SIZE_MAX); break; case 'B': - /* TODO set bufsize */ + bufsize = (size_t)parse_uint_arg(ARG(), 1u, SIZE_MAX); break; case 'i': regex_flags |= REG_ICASE; have_unused_flags = 1; break; @@ -304,35 +404,42 @@ main(int argc, char *argv[]) path = argv[0]; - if (argc >= 2) { - /* TODO set first_block */ - } - - if (argc >= 3) { - /* TODO set last_block */ - if (last_block < first_block) - usage(); - } + if (argc >= 2) + first_block = (off_t)parse_uint_arg(argv[1], 1u, (uintmax_t)OFF_MAX); + if (argc >= 3) + last_block = (off_t)parse_uint_arg(argv[2], (uintmax_t)first_block, (uintmax_t)OFF_MAX); if (blksize > (size_t)(SSIZE_MAX / 4)) eprintf("selected blocksize is too large"); if (bufsize > (size_t)SSIZE_MAX) bufsize = (size_t)SSIZE_MAX; - bufsize = bufsize / blksize; - bufsize = MAX(bufsize, 2); - bufsize *= blksize; + nblkmarks = bufsize / blksize; + nblkmarks = MAX(nblkmarks, 2); + bufsize = nblkmarks * blksize; text = emalloc(bufsize); first_block *= (off_t)blksize; if (last_block >= 0) { last_block += 1; last_block *= (off_t)blksize; } + blkmarks = emalloc(nblkmarks); /* Open file to analyse */ fd = open(path, O_RDONLY); if (fd < 0) eprintf("open %s O_RDONLY", path); + /* Advise the kernel about the access pattern */ + posix_fadvise(fd, 0, first_block, POSIX_FADV_DONTNEED); + if (last_block >= 0) { + posix_fadvise(fd, last_block, 0, POSIX_FADV_DONTNEED); + posix_fadvise(fd, first_block, last_block, POSIX_FADV_SEQUENTIAL); + posix_fadvise(fd, first_block, last_block, POSIX_FADV_NOREUSE); + } else { + posix_fadvise(fd, first_block, 0, POSIX_FADV_SEQUENTIAL); + posix_fadvise(fd, first_block, 0, POSIX_FADV_NOREUSE); + } + /* Can skip holes? */ if (fstat(fd, &st)) eprintf("fstat %s:", path); @@ -352,7 +459,7 @@ main(int argc, char *argv[]) continue; eprintf("read %s:", path); } - position += (off_t)r; + remaining -= (off_t)r; } } position = first_block; @@ -394,11 +501,19 @@ main(int argc, char *argv[]) hole = last_block; /* Analyse the data */ - while (data < hole) { - off_t n_off = hole - data; - size_t n_size = bufsize - available; - size_t n = (uintmax_t)n_off > (uintmax_t)SSIZE_MAX ? n_size : MIN((size_t)n_off, n_size); - ssize_t r = read(fd, &text[available], n); + for (;;) { + size_t n; + ssize_t r; + if (hole < 0) { + n = bufsize - available; + } else if (data < hole) { + off_t n_off = hole - data; + size_t n_size = bufsize - available; + n = (uintmax_t)n_off > (uintmax_t)SSIZE_MAX ? n_size : MIN((size_t)n_off, n_size); + } else { + break; + } + r = read(fd, &text[available], n); if (r <= 0) { if (!r) goto eof; @@ -406,6 +521,7 @@ main(int argc, char *argv[]) continue; eprintf("read %s:", path); } + data += (off_t)r; available += (size_t)r; if (available == bufsize) available = encountered_data(text, available); @@ -430,5 +546,6 @@ out: free(automata); free(text); free(lower_text); + free(blkmarks); return 0; } |
