diff options
| author | Mattias Andrée <m@maandree.se> | 2026-06-04 19:05:04 +0200 |
|---|---|---|
| committer | Mattias Andrée <m@maandree.se> | 2026-06-04 19:05:04 +0200 |
| commit | 0a4371efcadf68f49a763964226eb8e8bd4d44ec (patch) | |
| tree | 3e2e2fd5b404df7bdc53877d0bee5bae61a2b0c2 /blkseekr.c | |
| download | blkseekr-0a4371efcadf68f49a763964226eb8e8bd4d44ec.tar.gz blkseekr-0a4371efcadf68f49a763964226eb8e8bd4d44ec.tar.bz2 blkseekr-0a4371efcadf68f49a763964226eb8e8bd4d44ec.tar.xz | |
First commit
Signed-off-by: Mattias Andrée <m@maandree.se>
Diffstat (limited to 'blkseekr.c')
| -rw-r--r-- | blkseekr.c | 297 |
1 files changed, 297 insertions, 0 deletions
diff --git a/blkseekr.c b/blkseekr.c new file mode 100644 index 0000000..0f1c199 --- /dev/null +++ b/blkseekr.c @@ -0,0 +1,297 @@ +/* See LICENSE file for copyright and license details. */ +#include <libsimple-arg.h> +#include <libsimple.h> +#include <regex.h> + +USAGE("[-b block-size] [-h] ([-i | +i | -n | +n] ... (-r basic-regex | -e extended-regex | -x text)) ... " + "file [first-block [last-block]]"); + + +/* + * TODO on SIGINT stop and print last completed block + * TODO use madvise, posix_advise or readahead + * TODO allocate addition blocks to reduce memory copying + * TODO show progress + * TODO make program that extracts (binary) or displays (hex + text) the blocks + * TODO add support for NUL support in regex + */ + + +struct memstring { + const char *text; + size_t len; + int icase; +}; + + +static struct memstring *texts = NULL; +static size_t ntexts = 0; +static regex_t *automata = NULL; +static size_t nautomata = 0; +static int analyse_holes = 0; +static off_t position; + + +static void +analyse(char *block, size_t blksize, size_t available) /* TODO */ +{ +} + + +static size_t +encountered_data(char *block, size_t blksize, size_t available) +{ + if (available < blksize * 2u) + return available; + + analyse(block, blksize, available); + + memcpy(&block[0], &block[blksize], blksize); + position += (off_t)blksize; + return blksize; + +} + + +static size_t +encountered_hole(char *block, size_t blksize, size_t available, off_t size) +{ + off_t q = size / (off_t)blksize; + off_t r = size % (off_t)blksize; + size_t n; + + if (q) { + q--; + r += (off_t)blksize; + } + + while (r || q) { + n = blksize * 2u - available; + n = MIN(n, (size_t)r); + memset(&block[available], 0, n); + available += n; + r -= (off_t)n; + + available = encountered_data(block, blksize, available); + if (q) { + q--; + memset(&block[available], 0, blksize); + available = encountered_data(block, blksize, available); + } + if (q) { + q--; + memset(&block[available], 0, blksize); + available = encountered_data(block, blksize, available); + } + if (analyse_holes) { + for (; q; q--) + available = encountered_data(block, blksize, available); + } + } + + return available; +} + + +static void +encountered_eof(char *block, size_t blksize, size_t available) +{ + analyse(block, blksize, available); +} + + +static void +eregcomp(regex_t *restrict preg, const char *restrict regex, int cflags) +{ + int e = regcomp(preg, regex, cflags); + if (e) { + char buf_static[512]; + char *buf_dynamic = NULL; + char *buf = buf_static; + size_t buf_size = sizeof(buf_static); + size_t r; + regerror_again: + r = regerror(e, preg, buf, buf_size); + if (r > buf_size) { + buf = buf_dynamic = erealloc(buf_dynamic, buf_size *= 2); + goto regerror_again; + } + eprintf("regcomp %s: %s", regex, buf); + free(buf_dynamic); + } +} + + +int +main(int argc, char *argv[]) +{ + const char *path; + char *block; + size_t available = 0u; + int regex_flags = REG_NOSUB; + int have_unused_flags = 0; + size_t blksize = 4ul << 10; + off_t first_block = 0; + off_t last_block = -1; + off_t data, hole; + int skip_holes = 1; + int fd; + struct stat st; + + ARGBEGIN { + case 'r': + case 'e': + have_unused_flags = 0; + automata = ereallocarray(automata, nautomata + 1u, sizeof(*automata)); + eregcomp(&automata[nautomata++], ARG(), regex_flags | (FLAG() == 'e' ? REG_EXTENDED : 0)); + break; + + case 'x': + have_unused_flags = 0; + texts = ereallocarray(texts, ntexts + 1u, sizeof(*texts)); + texts[ntexts].text = ARG(); + texts[ntexts].len = strlen(texts[ntexts].text); + texts[ntexts].icase = !!(regex_flags & REG_ICASE); + ntexts++; + break; + + case 'h': + analyse_holes = 1; + break; + + case 'b': + /* TODO set `blksize` */ + break; + + case 'i': regex_flags |= REG_ICASE; have_unused_flags = 1; break; + case 'n': regex_flags |= REG_NEWLINE; have_unused_flags = 1; break; + default: + usage(); + } ARGALT('+') { + case 'i': regex_flags &= ~REG_ICASE; have_unused_flags = 1; break; + case 'n': regex_flags &= ~REG_NEWLINE; have_unused_flags = 1; break; + default: + usage(); + } ARGEND; + + if (argc < 1 || argc > 3 || have_unused_flags || !nautomata) + usage(); + + path = argv[0]; + + if (argc >= 2) + ; /* TODO set first_block */ + + if (argc >= 3) { + ; /* TODO set last_block */ + if (last_block < first_block) + usage(); + } + + block = emalloc(blksize * 2u); + first_block *= (off_t)blksize; + if (last_block >= 0) { + last_block += 1; + last_block *= (off_t)blksize; + } + + /* Open file to analyse */ + fd = open(path, O_RDONLY); + if (fd < 0) + eprintf("open %s O_RDONLY", path); + + /* Can skip holes? */ + if (fstat(fd, &st)) + eprintf("fstat %s:", path); + if (!S_ISREG(st.st_mode)) + skip_holes = 0; + + /* Skip to first block to analyse */ + if (first_block && lseek(fd, first_block, SEEK_SET) == (off_t)-1) { + off_t remaining = first_block; + while (remaining) { + uintmax_t n = MIN((uintmax_t)remaining, (uintmax_t)(blksize * 2u)); + ssize_t r = read(fd, block, (size_t)n); + if (r <= 0) { + if (!r) + goto out; + if (errno == EINTR) + continue; + eprintf("read %s:", path); + } + position += (off_t)r; + } + } + position = first_block; + + /* Analyse to last block (to end of file if last_block < 0) */ + for (; last_block < 0 || first_block < last_block; first_block = data) { + /* Seek to data to find end of hole, and analyse the hole */ + if (skip_holes) { + data = lseek(fd, first_block, SEEK_DATA); + if (data == (off_t)-1) { + data = first_block; + if (errno != EBADF) + skip_holes = 0; + else + eprintf("lseek %s %ji SEEK_DATA", path, (intmax_t)first_block); + } else { + available = encountered_hole(block, blksize, available, data - first_block); + } + } else { + data = first_block; + } + + /* Seek to hole to find end of data */ + if (skip_holes) { + hole = lseek(fd, data, SEEK_HOLE); + if (hole == (off_t)-1) { + if (errno != EBADF) + skip_holes = 0; + else + eprintf("lseek %s %ji SEEK_HOLE", path, (intmax_t)data); + } else { + if (lseek(fd, data, SEEK_SET) == (off_t)-1) + eprintf("lseek %s %ji SEEK_SET", path, (intmax_t)data); + } + } else { + hole = (off_t)-1; + } + if (hole > last_block) + hole = last_block; + + /* Analyse the data */ + while (data < hole) { + off_t n_off = hole - data; + size_t n_size = blksize * 2u - available; + size_t n = (uintmax_t)n_off > (uintmax_t)SSIZE_MAX ? n_size : MIN((size_t)n_off, n_size); + ssize_t r = read(fd, &block[available], n); + if (r <= 0) { + if (!r) + goto eof; + if (errno == EINTR) + continue; + eprintf("read %s:", path); + } + available += (size_t)r; + if (available == blksize * 2u) + available = encountered_data(block, blksize, available); + } + } +eof: + /* Analyse end of file */ + available = encountered_data(block, blksize, available); + if (skip_holes && first_block < st.st_size) + available = encountered_hole(block, blksize, available, st.st_size - first_block); + encountered_eof(block, blksize, available); + +out: + close(fd); + + while (nautomata) + regfree(&automata[--nautomata]); + free(automata); + free(texts); + free(block); + return 0; +} |
