summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 15
-rw-r--r-- LICENSE 15
-rw-r--r-- Makefile 37
-rw-r--r-- blkseekr.c 297
-rw-r--r-- config.mk 8
5 files changed, 372 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..031be57
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,15 @@
+*\#*
+*~
+*.o
+*.a
+*.lo
+*.su
+*.so
+*.so.*
+*.dll
+*.dylib
+*.gch
+*.gcov
+*.gcno
+*.gcda
+/blkseekr
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..6c7f02b
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,15 @@
+ISC License
+
+© 2025, 2026 Mattias Andrée <m@maandree.se>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..99af225
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,37 @@
+.POSIX:
+# Build settings (PREFIX, MANPREFIX, CC and the flag variables) are read from the configuration file
+CONFIGFILE = config.mk
+include $(CONFIGFILE)
+
+OBJ =\
+ blkseekr.o
+
+HDR =
+# Default target: build the program; every object also depends on the headers listed in HDR
+all: blkseekr
+$(OBJ): $(HDR)
+
+.c.o:
+ $(CC) -c -o $@ $< $(CFLAGS) $(CPPFLAGS)
+
+blkseekr: $(OBJ)
+ $(CC) -o $@ $(OBJ) $(LDFLAGS)
+# Copy the binary below $(DESTDIR)$(PREFIX); the man page directory is created but the page itself is still a TODO
+install: blkseekr
+ mkdir -p -- "$(DESTDIR)$(PREFIX)/bin"
+ mkdir -p -- "$(DESTDIR)$(MANPREFIX)/man1/"
+ cp -- blkseekr "$(DESTDIR)$(PREFIX)/bin/"
+# TODO cp -- blkseekr.1 "$(DESTDIR)$(MANPREFIX)/man1/"
+
+uninstall:
+ -rm -f -- "$(DESTDIR)$(PREFIX)/bin/blkseekr"
+ -rm -f -- "$(DESTDIR)$(MANPREFIX)/man1/blkseekr.1"
+# Remove build products matching the listed patterns, then the program itself
+clean:
+ -rm -f -- *.o *.a *.lo *.su *.so *.so.* *.gch *.gcov *.gcno *.gcda
+ -rm -f -- blkseekr
+
+.SUFFIXES:
+.SUFFIXES: .o .c
+
+.PHONY: all install uninstall clean
diff --git a/blkseekr.c b/blkseekr.c
new file mode 100644
index 0000000..0f1c199
--- /dev/null
+++ b/blkseekr.c
@@ -0,0 +1,297 @@
+/* See LICENSE file for copyright and license details. */
+#include <libsimple-arg.h>
+#include <libsimple.h>
+#include <regex.h>
+/* Command-line synopsis; NOTE(review): USAGE presumably also defines the usage() that main() calls — confirm in libsimple-arg.h */
+USAGE("[-b block-size] [-h] ([-i | +i | -n | +n] ... (-r basic-regex | -e extended-regex | -x text)) ... "
+ "file [first-block [last-block]]");
+
+
+/*
+ * TODO on SIGINT stop and print last completed block
+ * TODO use madvise, posix_fadvise or readahead
+ * TODO allocate additional blocks to reduce memory copying
+ * TODO show progress
+ * TODO make program that extracts (binary) or displays (hex + text) the blocks
+ * TODO add support for NUL bytes in regex
+ */
+
+/* A literal search text given with -x, together with the case-insensitivity setting in effect at that point */
+struct memstring {
+ const char *text; /* the -x argument itself (the pointer is stored, not a copy) */
+ size_t len; /* strlen(text), computed when the option is parsed */
+ int icase; /* 1 if REG_ICASE was set in the flags when -x was parsed, otherwise 0 */
+};
+
+/* State shared between option parsing in main() and the block handlers */
+static struct memstring *texts = NULL; /* one entry per -x option */
+static size_t ntexts = 0; /* number of entries in `texts` */
+static regex_t *automata = NULL; /* one compiled regex per -r or -e option */
+static size_t nautomata = 0; /* number of entries in `automata` */
+static int analyse_holes = 0; /* set to 1 by -h; read by encountered_hole() */
+static off_t position; /* set to the first block's byte offset in main(); advanced by blksize in encountered_data() */
+
+/* Placeholder for the search over the buffered window: the body is still empty, so calling it has no effect */
+static void
+analyse(char *block, size_t blksize, size_t available) /* TODO */
+{
+}
+
+/* Analyse a full two-block window, then slide it forward by one block; returns the new fill level */
+static size_t
+encountered_data(char *block, size_t blksize, size_t available)
+{
+	const size_t window = blksize * 2u;
+
+	if (available >= window) { /* act only once both halves are filled */
+		analyse(block, blksize, available);
+		memcpy(block, block + blksize, blksize); /* second half becomes the new first half */
+		position += (off_t)blksize;
+		available = blksize;
+	}
+	return available;
+}
+
+/* Feed a hole of `size` zero bytes through the two-block window; returns the new fill level */
+static size_t
+encountered_hole(char *block, size_t blksize, size_t available, off_t size)
+{
+	const size_t window = blksize * 2u;
+	off_t fed = 0; /* zero bytes from this hole placed in the window so far */
+	off_t skipped;
+	size_t n;
+
+	while (size > 0) {
+		/*
+		 * Top the window up with zeros, but never beyond its end: the
+		 * buffer is only two blocks long, so each write is capped at the
+		 * free space that is left
+		 */
+		n = window - available;
+		if ((uintmax_t)size < (uintmax_t)n)
+			n = (size_t)size;
+		memset(&block[available], 0, n);
+		available += n;
+		fed += (off_t)n;
+		size -= (off_t)n;
+		available = encountered_data(block, blksize, available);
+
+		/*
+		 * After a slide the retained block is the former second half; once
+		 * at least a block of this hole has been fed it is all zeros, so
+		 * unless -h was given the whole blocks that follow would only
+		 * produce all-zero windows and are stepped over without filling
+		 */
+		if (!analyse_holes && available == blksize && fed >= (off_t)blksize) {
+			skipped = size - size % (off_t)blksize;
+			position += skipped;
+			size -= skipped;
+		}
+	}
+
+	return available;
+}
+
+/* End of input: hand whatever is left in the window to analyse() */
+static void
+encountered_eof(char *block, size_t blksize, size_t available)
+{
+ analyse(block, blksize, available);
+}
+
+/* Compile `regex` into `preg`, or report the regerror(3) message through eprintf() */
+static void
+eregcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
+{
+	char stackbuf[512];
+	char *heapbuf = NULL;
+	char *msg = stackbuf;
+	size_t cap = sizeof(stackbuf);
+	int err = regcomp(preg, regex, cflags);
+
+	if (!err)
+		return;
+
+	/* regerror() returns the size it needs; double the buffer until the message fits */
+	while (regerror(err, preg, msg, cap) > cap) {
+		cap *= 2;
+		msg = heapbuf = erealloc(heapbuf, cap);
+	}
+	eprintf("regcomp %s: %s", regex, msg);
+	free(heapbuf);
+}
+
+/* Parse the options, open the file and feed it through a two-block window, stepping over holes where lseek can report them */
+int
+main(int argc, char *argv[])
+{
+	const char *path;
+	char *block;
+	size_t available = 0u;
+	int regex_flags = REG_NOSUB;
+	int have_unused_flags = 0;
+	size_t blksize = 4ul << 10;
+	off_t first_block = 0;
+	off_t last_block = -1;
+	off_t data, hole;
+	int skip_holes = 1;
+	int fd;
+	struct stat st;
+
+	ARGBEGIN {
+	case 'r':
+	case 'e':
+		have_unused_flags = 0;
+		automata = ereallocarray(automata, nautomata + 1u, sizeof(*automata));
+		eregcomp(&automata[nautomata++], ARG(), regex_flags | (FLAG() == 'e' ? REG_EXTENDED : 0));
+		break;
+
+	case 'x':
+		have_unused_flags = 0;
+		texts = ereallocarray(texts, ntexts + 1u, sizeof(*texts));
+		texts[ntexts].text = ARG();
+		texts[ntexts].len = strlen(texts[ntexts].text);
+		texts[ntexts].icase = !!(regex_flags & REG_ICASE);
+		ntexts++;
+		break;
+
+	case 'h':
+		analyse_holes = 1;
+		break;
+
+	case 'b':
+		/* TODO set `blksize` */
+		break;
+
+	case 'i': regex_flags |= REG_ICASE; have_unused_flags = 1; break;
+	case 'n': regex_flags |= REG_NEWLINE; have_unused_flags = 1; break;
+	default:
+		usage();
+	} ARGALT('+') {
+	case 'i': regex_flags &= ~REG_ICASE; have_unused_flags = 1; break;
+	case 'n': regex_flags &= ~REG_NEWLINE; have_unused_flags = 1; break;
+	default:
+		usage();
+	} ARGEND;
+
+	/* A pattern of any kind (-r, -e or -x) is required, and no -i/-n may trail the last one */
+	if (argc < 1 || argc > 3 || have_unused_flags || (!nautomata && !ntexts))
+		usage();
+
+	path = argv[0];
+	if (argc >= 2)
+		; /* TODO set first_block */
+
+	if (argc >= 3) {
+		; /* TODO set last_block */
+		if (last_block < first_block)
+			usage();
+	}
+
+	block = emalloc(blksize * 2u);
+	first_block *= (off_t)blksize;
+	if (last_block >= 0) {
+		last_block += 1; /* make the end exclusive before scaling to bytes */
+		last_block *= (off_t)blksize;
+	}
+
+	/* Open file to analyse */
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		eprintf("open %s O_RDONLY:", path);
+
+	/* Can skip holes? */
+	if (fstat(fd, &st))
+		eprintf("fstat %s:", path);
+	if (!S_ISREG(st.st_mode))
+		skip_holes = 0;
+
+	/* Skip to first block to analyse, reading and discarding when the file cannot seek */
+	if (first_block && lseek(fd, first_block, SEEK_SET) == (off_t)-1) {
+		off_t remaining = first_block;
+		while (remaining) {
+			uintmax_t n = MIN((uintmax_t)remaining, (uintmax_t)(blksize * 2u));
+			ssize_t r = read(fd, block, (size_t)n);
+			if (r <= 0) {
+				if (!r)
+					goto out;
+				if (errno == EINTR)
+					continue;
+				eprintf("read %s:", path);
+			}
+			remaining -= (off_t)r;
+		}
+	}
+	position = first_block;
+
+	/* Analyse to last block (to end of file if last_block < 0) */
+	for (; last_block < 0 || first_block < last_block; first_block = data) {
+		/* Seek to data to find end of hole, and analyse the hole */
+		if (skip_holes) {
+			data = lseek(fd, first_block, SEEK_DATA);
+			if (data == (off_t)-1) {
+				data = first_block;
+				if (errno != EBADF)
+					skip_holes = 0;
+				else
+					eprintf("lseek %s %ji SEEK_DATA:", path, (intmax_t)first_block);
+			} else {
+				available = encountered_hole(block, blksize, available, data - first_block);
+			}
+		} else {
+			data = first_block;
+		}
+
+		/* Seek to hole to find end of data */
+		if (skip_holes) {
+			hole = lseek(fd, data, SEEK_HOLE);
+			if (hole == (off_t)-1) {
+				if (errno != EBADF)
+					skip_holes = 0;
+				else
+					eprintf("lseek %s %ji SEEK_HOLE:", path, (intmax_t)data);
+			} else {
+				if (lseek(fd, data, SEEK_SET) == (off_t)-1)
+					eprintf("lseek %s %ji SEEK_SET:", path, (intmax_t)data);
+			}
+		} else {
+			hole = (off_t)-1;
+		}
+		if (last_block >= 0 && (hole < 0 || hole > last_block)) /* clamp only when an end was requested */
+			hole = last_block;
+		/* Analyse the data; a hole of -1 means "no known end", so read until end of file */
+		while (hole < 0 || data < hole) {
+			off_t n_off = hole - data;
+			size_t n_size = blksize * 2u - available;
+			size_t n = (uintmax_t)n_off > (uintmax_t)SSIZE_MAX ? n_size : MIN((size_t)n_off, n_size);
+			ssize_t r = read(fd, &block[available], n);
+			if (r <= 0) {
+				if (!r)
+					goto eof;
+				if (errno == EINTR)
+					continue;
+				eprintf("read %s:", path);
+			}
+			data += (off_t)r;
+			available += (size_t)r;
+			if (available == blksize * 2u)
+				available = encountered_data(block, blksize, available);
+		}
+	}
+eof:
+	/* Analyse end of file */
+	available = encountered_data(block, blksize, available);
+	if (skip_holes && first_block < st.st_size) /* NOTE(review): also taken when the loop stopped at last_block — confirm intent */
+		available = encountered_hole(block, blksize, available, st.st_size - first_block);
+	encountered_eof(block, blksize, available);
+
+out:
+	close(fd);
+
+	while (nautomata)
+		regfree(&automata[--nautomata]);
+	free(automata);
+	free(texts);
+	free(block);
+	return 0;
+}
diff --git a/config.mk b/config.mk
new file mode 100644
index 0000000..aeb8cec
--- /dev/null
+++ b/config.mk
@@ -0,0 +1,8 @@
+PREFIX = /usr
+MANPREFIX = $(PREFIX)/share/man
+# Compiler driver; the Makefile uses it for both compiling and linking
+CC = c99
+# Feature-test macros passed when compiling, and the libsimple library passed when linking
+CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -D_GNU_SOURCE
+CFLAGS =
+LDFLAGS = -lsimple