summaryrefslogtreecommitdiffstats
path: root/blkseekr.c
diff options
context:
space:
mode:
authorMattias Andrée <m@maandree.se>2026-06-04 22:42:16 +0200
committerMattias Andrée <m@maandree.se>2026-06-04 22:42:16 +0200
commit19cc5d523e5a9feeab5a36ce779ddbfda0717b5a (patch)
tree72c3b1904fd1626f802d7c8c93f8cfa2eff4fcae /blkseekr.c
parentSecond commit (diff)
downloadblkseekr-19cc5d523e5a9feeab5a36ce779ddbfda0717b5a.tar.gz
blkseekr-19cc5d523e5a9feeab5a36ce779ddbfda0717b5a.tar.bz2
blkseekr-19cc5d523e5a9feeab5a36ce779ddbfda0717b5a.tar.xz
Third commit
Signed-off-by: Mattias Andrée <m@maandree.se>
Diffstat (limited to 'blkseekr.c')
-rw-r--r--blkseekr.c221
1 files changed, 169 insertions, 52 deletions
diff --git a/blkseekr.c b/blkseekr.c
index 8e42289..aa55f0b 100644
--- a/blkseekr.c
+++ b/blkseekr.c
@@ -5,17 +5,15 @@
#include <libautomata.h>
USAGE("[-b block-size] [-B buffer-size] [-h] "
- "([-i | +i | -n | +n] ... (-r basic-regex | -e extended-regex | -x text)) ... "
+ "([-i | +i | -n | +n] ... (-r basic-regex | -e extended-regex | -x text | -X hextext)) ... "
"file [first-block [last-block]]");
/*
* TODO on SIGINT stop and print last completed block
- * TODO use madvise, posix_advise or readahead (?)
* TODO show progress
* TODO make program that extracts (binary) or displays (hex + text) the blocks
- * TODO add support for NUL support in regex
- * TODO add support for hexadecimally encoded text (-X)
+ * TODO add support for NUL support in regex (once supported in libautomata)
*/
@@ -47,13 +45,21 @@ static size_t blksize = 4ul << 10;
static size_t bufsize = 8ul << 20;
static char *lower_text = NULL;
static size_t lower_text_size = 0u;
+static char *blkmarks;
+static size_t nblkmarks;
+static size_t low_mark;
+static size_t high_mark;
static void
-report(off_t beginning)
+mark(off_t beginning)
{
- printf("Found expression beginning at byte %ju\n", (uintmax_t)beginning);
- fflush(stdout);
+ size_t mark = (size_t)(beginning - position) / blksize;
+ blkmarks[mark] = 1;
+ if (low_mark > mark)
+ low_mark = mark;
+ if (high_mark < mark)
+ high_mark = mark;
}
@@ -71,7 +77,7 @@ analyse_block_with_fixed(char *block, size_t start_end, size_t available, struct
break;
beginning = end - automaton->len;
offset = (size_t)(beginning - block);
- report(position + (off_t)offset);
+ mark(position + (off_t)offset);
offset += 1u;
}
}
@@ -80,40 +86,68 @@ analyse_block_with_fixed(char *block, size_t start_end, size_t available, struct
static void
analyse_block_with_regex(char *block, size_t start_end, size_t available, regex_t *automaton)
{
- /* TODO */
+ size_t offset = 0u;
+ regmatch_t match;
+ int r;
+
+ while (offset < start_end) {
+ match.rm_so = (regoff_t)offset;
+ match.rm_eo = (regoff_t)available;
+ r = regexec(automaton, block, 1u, &match, REG_STARTEND);
+ if (r == REG_NOMATCH)
+ break;
+ if (r)
+ eprintf("regexec:");
+ if ((size_t)match.rm_so >= start_end)
+ break;
+ mark(position + (off_t)match.rm_so);
+ offset = (size_t)match.rm_so + 1u;
+ }
}
static void
analyse_block(char *block, size_t start_end, size_t available)
{
- size_t i, j;
+ size_t i = 0u, off = 0u, blkoff = 0u;
int lowered = 0;
- for (i = 0u; i < nautomata; i++) {
+
+ memset(blkmarks, 0, nblkmarks);
+ low_mark = SIZE_MAX;
+ high_mark = 0u;
+
+ for (;;) {
switch (automata[i].type) {
case FIXED_ICASE:
if (!lowered) {
lowered = 1;
- if (available > lower_text_size)
- lower_text = erealloc(lower_text, lower_text_size = available);
- for (j = 0u; j < available; j++) {
- if ('A' <= block[i] && block[i] <= 'Z')
- lower_text[i] = (char)(block[i] ^ ('a' ^ 'A'));
- else
- lower_text[i] = block[i];
- }
+ if (available - off > lower_text_size)
+ lower_text = erealloc(lower_text, lower_text_size = available - off);
+ libsimple_memtolower(&lower_text[off], &block[off], available - off);
}
- analyse_block_with_fixed(lower_text, start_end, available, &automata[i].a.fixed);
+ analyse_block_with_fixed(&lower_text[off], start_end - off, available - off, &automata[i].a.fixed);
break;
case FIXED:
- analyse_block_with_fixed(block, start_end, available, &automata[i].a.fixed);
+ analyse_block_with_fixed(&block[off], start_end - off, available - off, &automata[i].a.fixed);
break;
case REGEX:
- analyse_block_with_regex(block, start_end, available, &automata[i].a.regex);
+ analyse_block_with_regex(&block[off], start_end - off, available - off, &automata[i].a.regex);
break;
default:
abort();
}
+
+ if (++i == nautomata)
+ break;
+
+ if (blkoff == low_mark)
+ for (; blkoff <= high_mark && blkmarks[blkoff]; blkoff++)
+ off += blksize;
+ }
+
+ for (i = 0u; i < nblkmarks; i++) {
+ printf("Found expression beginning in block %ju\n", (uintmax_t)position / blksize + i);
+ fflush(stdout);
}
}
@@ -146,11 +180,11 @@ encountered_hole(char *text, size_t available, off_t size)
* be skipped the processing requires two blocks) */
if (available >= blksize * 2u) {
size_t full_avail = available - (available % blksize);
- size_t start_end = available - blksize;
+ size_t start_end = full_avail - blksize;
analyse_block(text, start_end, full_avail);
position += (off_t)start_end;
available -= start_end;
- memcpy(&text[0], &text[start_end], available - start_end);
+ memcpy(&text[0], &text[start_end], available);
}
/* If there is data buffered remaining, fill to two
@@ -186,7 +220,7 @@ encountered_hole(char *text, size_t available, off_t size)
while (size >= (off_t)(blksize * 2u)) {
size_t max = (size_t)MIN((uintmax_t)size, (uintmax_t)bufsize);
size_t full_avail = max - (max % blksize);
- size_t start_end = max - blksize;
+ size_t start_end = full_avail - blksize;
analyse_block(text, start_end, full_avail);
position += (off_t)start_end;
size -= (off_t)start_end;
@@ -225,6 +259,50 @@ eregcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
}
+#if defined(__GNUC__)
+__attribute__((__pure__))
+#endif
+static uintmax_t
+parse_uint_arg(const char *s, uintmax_t min, uintmax_t max)
+{
+ uintmax_t v = 0u, d;
+ if (!*s)
+ usage();
+ while ('0' <= *s && *s <= '9') {
+ d = (uintmax_t)(*s++ - '0');
+ if (d > 9u || v > (max - d) / 10u) /* we know that max>d */
+ usage();
+ v = v * 10u + v;
+ }
+ if (*s || v < min)
+ usage();
+ return v;
+}
+
+
+static void
+decode_hex(char *bin, const char *hex, size_t n)
+{
+ size_t i, j, end;
+ uint8_t value;
+ for (i = 0u, j = 0u; i < n; i += 1u) {
+ value = 0u;
+ for (end = j + 2u; j < end; j++) {
+ value <<= 4;
+ if ('0' <= hex[j] && hex[j] <= '9')
+ value |= (uint8_t)(hex[j] - '0');
+ else if ('a' <= hex[j] && hex[j] <= 'f')
+ value |= (uint8_t)(hex[j] - 'a' + 10);
+ else if ('A' <= hex[j] && hex[j] <= 'F')
+ value |= (uint8_t)(hex[j] - 'A' + 10);
+ else
+ usage();
+ }
+ bin[i] = (char)value;
+ }
+}
+
+
int
main(int argc, char *argv[])
{
@@ -232,7 +310,7 @@ main(int argc, char *argv[])
const char *pattern;
char *pattern_free;
char *text;
- size_t available = 0u;
+ size_t len, available = 0u;
int regex_flags = REG_NOSUB;
int have_unused_flags = 0;
off_t first_block = 0;
@@ -241,7 +319,6 @@ main(int argc, char *argv[])
int skip_holes = 1;
int fd;
struct stat st;
- size_t i;
ARGBEGIN {
case 'r':
@@ -257,19 +334,42 @@ main(int argc, char *argv[])
have_unused_flags = 0;
automata = ereallocarray(automata, nautomata + 1u, sizeof(*automata));
pattern = ARG();
+ automata[nautomata].a.fixed.len = len = strlen(pattern);
+ if (!len)
+ usage();
if (regex_flags & REG_ICASE) {
automata[nautomata].type = FIXED_ICASE;
pattern_free = estrdup(pattern);
- for (i = 0u; pattern_free[i]; i++)
- if ('A' <= pattern_free[i] && pattern_free[i] <= 'Z')
- pattern_free[i] ^= (char)('a' ^ 'A');
+ libsimple_memtolower(pattern_free, pattern_free, len);
pattern = pattern_free;
} else {
automata[nautomata].type = FIXED;
pattern_free = NULL;
}
- automata[nautomata].a.fixed.len = strlen(pattern);
- automata[nautomata].a.fixed.kmp = libautomata_compile_kmp_automaton(pattern, strlen(pattern), sizeof(char));
+ automata[nautomata].a.fixed.kmp = libautomata_compile_kmp_automaton(pattern, len, sizeof(char));
+ if (!automata[nautomata].a.fixed.kmp)
+ eprintf("libautomata_compile_kmp_automaton:");
+ nautomata++;
+ free(pattern_free);
+ break;
+
+ case 'X':
+ have_unused_flags = 0;
+ automata = ereallocarray(automata, nautomata + 1u, sizeof(*automata));
+ pattern = ARG();
+ len = strlen(pattern);
+ if (!len || (len & 1u))
+ usage();
+ automata[nautomata].a.fixed.len = len /= 2u;
+ pattern_free = emalloc(len);
+ decode_hex(pattern_free, pattern, len);
+ if (regex_flags & REG_ICASE) {
+ automata[nautomata].type = FIXED_ICASE;
+ libsimple_memtolower(pattern_free, pattern_free, len);
+ } else {
+ automata[nautomata].type = FIXED;
+ }
+ automata[nautomata].a.fixed.kmp = libautomata_compile_kmp_automaton(pattern_free, len, sizeof(char));
if (!automata[nautomata].a.fixed.kmp)
eprintf("libautomata_compile_kmp_automaton:");
nautomata++;
@@ -281,11 +381,11 @@ main(int argc, char *argv[])
break;
case 'b':
- /* TODO set blksize */
+ blksize = (size_t)parse_uint_arg(ARG(), 1u, SIZE_MAX);
break;
case 'B':
- /* TODO set bufsize */
+ bufsize = (size_t)parse_uint_arg(ARG(), 1u, SIZE_MAX);
break;
case 'i': regex_flags |= REG_ICASE; have_unused_flags = 1; break;
@@ -304,35 +404,42 @@ main(int argc, char *argv[])
path = argv[0];
- if (argc >= 2) {
- /* TODO set first_block */
- }
-
- if (argc >= 3) {
- /* TODO set last_block */
- if (last_block < first_block)
- usage();
- }
+ if (argc >= 2)
+ first_block = (off_t)parse_uint_arg(argv[1], 1u, (uintmax_t)OFF_MAX);
+ if (argc >= 3)
+ last_block = (off_t)parse_uint_arg(argv[2], (uintmax_t)first_block, (uintmax_t)OFF_MAX);
if (blksize > (size_t)(SSIZE_MAX / 4))
eprintf("selected blocksize is too large");
if (bufsize > (size_t)SSIZE_MAX)
bufsize = (size_t)SSIZE_MAX;
- bufsize = bufsize / blksize;
- bufsize = MAX(bufsize, 2);
- bufsize *= blksize;
+ nblkmarks = bufsize / blksize;
+ nblkmarks = MAX(nblkmarks, 2);
+ bufsize = nblkmarks * blksize;
text = emalloc(bufsize);
first_block *= (off_t)blksize;
if (last_block >= 0) {
last_block += 1;
last_block *= (off_t)blksize;
}
+ blkmarks = emalloc(nblkmarks);
/* Open file to analyse */
fd = open(path, O_RDONLY);
if (fd < 0)
eprintf("open %s O_RDONLY", path);
+ /* Advise the kernel about the access pattern */
+ posix_fadvise(fd, 0, first_block, POSIX_FADV_DONTNEED);
+ if (last_block >= 0) {
+ posix_fadvise(fd, last_block, 0, POSIX_FADV_DONTNEED);
+ posix_fadvise(fd, first_block, last_block, POSIX_FADV_SEQUENTIAL);
+ posix_fadvise(fd, first_block, last_block, POSIX_FADV_NOREUSE);
+ } else {
+ posix_fadvise(fd, first_block, 0, POSIX_FADV_SEQUENTIAL);
+ posix_fadvise(fd, first_block, 0, POSIX_FADV_NOREUSE);
+ }
+
/* Can skip holes? */
if (fstat(fd, &st))
eprintf("fstat %s:", path);
@@ -352,7 +459,7 @@ main(int argc, char *argv[])
continue;
eprintf("read %s:", path);
}
- position += (off_t)r;
+ remaining -= (off_t)r;
}
}
position = first_block;
@@ -394,11 +501,19 @@ main(int argc, char *argv[])
hole = last_block;
/* Analyse the data */
- while (data < hole) {
- off_t n_off = hole - data;
- size_t n_size = bufsize - available;
- size_t n = (uintmax_t)n_off > (uintmax_t)SSIZE_MAX ? n_size : MIN((size_t)n_off, n_size);
- ssize_t r = read(fd, &text[available], n);
+ for (;;) {
+ size_t n;
+ ssize_t r;
+ if (hole < 0) {
+ n = bufsize - available;
+ } else if (data < hole) {
+ off_t n_off = hole - data;
+ size_t n_size = bufsize - available;
+ n = (uintmax_t)n_off > (uintmax_t)SSIZE_MAX ? n_size : MIN((size_t)n_off, n_size);
+ } else {
+ break;
+ }
+ r = read(fd, &text[available], n);
if (r <= 0) {
if (!r)
goto eof;
@@ -406,6 +521,7 @@ main(int argc, char *argv[])
continue;
eprintf("read %s:", path);
}
+ data += (off_t)r;
available += (size_t)r;
if (available == bufsize)
available = encountered_data(text, available);
@@ -430,5 +546,6 @@ out:
free(automata);
free(text);
free(lower_text);
+ free(blkmarks);
return 0;
}