/* See LICENSE file for copyright and license details. */ #include #include #include #include USAGE("[-b block-size] [-B buffer-size] [-h] " "([-i | +i | -n | +n] ... (-r basic-regex | -e extended-regex | -x text)) ... " "file [first-block [last-block]]"); /* * TODO on SIGINT stop and print last completed block * TODO use madvise, posix_advise or readahead (?) * TODO show progress * TODO make program that extracts (binary) or displays (hex + text) the blocks * TODO add support for NUL support in regex * TODO add support for hexadecimally encoded text (-X) */ enum automata_type { FIXED, FIXED_ICASE, REGEX }; struct fixed { LIBAUTOMATA_KMP_AUTOMATON *kmp; size_t len; }; struct automata { enum automata_type type; union { struct fixed fixed; regex_t regex; } a; }; static struct automata *automata = NULL; static size_t nautomata = 0u; static int analyse_holes = 0; static off_t position; static size_t blksize = 4ul << 10; static size_t bufsize = 8ul << 20; static char *lower_text = NULL; static size_t lower_text_size = 0u; static void report(off_t beginning) { printf("Found expression beginning at byte %ju\n", (uintmax_t)beginning); fflush(stdout); } static void analyse_block_with_fixed(char *block, size_t start_end, size_t available, struct fixed *automaton) { size_t extra, offset = 0; char *end, *beginning; extra = MIN(available - start_end, automaton->len); while (offset < start_end) { end = libautomata_execute_kmp_automaton(automaton->kmp, &block[offset], start_end - offset + extra); if (!end) break; beginning = end - automaton->len; offset = (size_t)(beginning - block); report(position + (off_t)offset); offset += 1u; } } static void analyse_block_with_regex(char *block, size_t start_end, size_t available, regex_t *automaton) { /* TODO */ } static void analyse_block(char *block, size_t start_end, size_t available) { size_t i, j; int lowered = 0; for (i = 0u; i < nautomata; i++) { switch (automata[i].type) { case FIXED_ICASE: if (!lowered) { lowered = 1; if (available > lower_text_size) lower_text = erealloc(lower_text, lower_text_size = available); for (j = 0u; j < available; j++) { if ('A' <= block[i] && block[i] <= 'Z') lower_text[i] = (char)(block[i] ^ ('a' ^ 'A')); else lower_text[i] = block[i]; } } analyse_block_with_fixed(lower_text, start_end, available, &automata[i].a.fixed); break; case FIXED: analyse_block_with_fixed(block, start_end, available, &automata[i].a.fixed); break; case REGEX: analyse_block_with_regex(block, start_end, available, &automata[i].a.regex); break; default: abort(); } } } static size_t encountered_data(char *text, size_t available) { /* Wait until the buffer is full */ if (available < bufsize) return available; /* Process the buffer */ analyse_block(text, bufsize - blksize, available); position += (off_t)(bufsize - blksize); /* Save last block to support patterns spanning two blocks */ memcpy(&text[0], &text[bufsize - blksize], blksize); return blksize; } static size_t encountered_hole(char *text, size_t available, off_t size) { off_t r; /* If there are at least two blocks of data buffered, * process all but the last one (the last one have to * be skipped the processing requires two blocks) */ if (available >= blksize * 2u) { size_t full_avail = available - (available % blksize); size_t start_end = available - blksize; analyse_block(text, start_end, full_avail); position += (off_t)start_end; available -= start_end; memcpy(&text[0], &text[start_end], available - start_end); } /* If there is data buffered remaining, fill to two * blocks with null bytes and analyse that data */ if (available) { size_t n = blksize * 2u - available; if ((uintmax_t)n > (uintmax_t)size) n = (size_t)size; memset(&text[available], 0, n); available += n; if (available < blksize * 2u) return available; analyse_block(text, available - blksize, available); position += (off_t)(available - blksize); size -= (off_t)n; available = blksize; } memset(text, 0, bufsize); size += (off_t)available; /* If we are not analyzing holes, skip pass the full * blocks and the full the buffer with as many null * bytes as there are bytes in the hole in excess of * full blocks */ if (!analyse_holes) { r = size % (off_t)blksize; position += size - r; return (size_t)r; } /* But if we are analysing holes... */ while (size >= (off_t)(blksize * 2u)) { size_t max = (size_t)MIN((uintmax_t)size, (uintmax_t)bufsize); size_t full_avail = max - (max % blksize); size_t start_end = max - blksize; analyse_block(text, start_end, full_avail); position += (off_t)start_end; size -= (off_t)start_end; } return (size_t)size; } static void encountered_eof(char *text, size_t available) { analyse_block(text, available, available); } static void eregcomp(regex_t *restrict preg, const char *restrict regex, int cflags) { int e = regcomp(preg, regex, cflags); if (e) { char buf_static[512]; char *buf_dynamic = NULL; char *buf = buf_static; size_t buf_size = sizeof(buf_static); size_t r; regerror_again: r = regerror(e, preg, buf, buf_size); if (r > buf_size) { buf = buf_dynamic = erealloc(buf_dynamic, buf_size *= 2u); goto regerror_again; } eprintf("regcomp %s: %s", regex, buf); free(buf_dynamic); } } int main(int argc, char *argv[]) { const char *path; const char *pattern; char *pattern_free; char *text; size_t available = 0u; int regex_flags = REG_NOSUB; int have_unused_flags = 0; off_t first_block = 0; off_t last_block = -1; off_t data, hole; int skip_holes = 1; int fd; struct stat st; size_t i; ARGBEGIN { case 'r': case 'e': have_unused_flags = 0; automata = ereallocarray(automata, nautomata + 1u, sizeof(*automata)); eregcomp(&automata[nautomata].a.regex, ARG(), regex_flags | (FLAG() == 'e' ? REG_EXTENDED : 0)); automata[nautomata].type = REGEX; nautomata++; break; case 'x': have_unused_flags = 0; automata = ereallocarray(automata, nautomata + 1u, sizeof(*automata)); pattern = ARG(); if (regex_flags & REG_ICASE) { automata[nautomata].type = FIXED_ICASE; pattern_free = estrdup(pattern); for (i = 0u; pattern_free[i]; i++) if ('A' <= pattern_free[i] && pattern_free[i] <= 'Z') pattern_free[i] ^= (char)('a' ^ 'A'); pattern = pattern_free; } else { automata[nautomata].type = FIXED; pattern_free = NULL; } automata[nautomata].a.fixed.len = strlen(pattern); automata[nautomata].a.fixed.kmp = libautomata_compile_kmp_automaton(pattern, strlen(pattern), sizeof(char)); if (!automata[nautomata].a.fixed.kmp) eprintf("libautomata_compile_kmp_automaton:"); nautomata++; free(pattern_free); break; case 'h': analyse_holes = 1; break; case 'b': /* TODO set blksize */ break; case 'B': /* TODO set bufsize */ break; case 'i': regex_flags |= REG_ICASE; have_unused_flags = 1; break; case 'n': regex_flags |= REG_NEWLINE; have_unused_flags = 1; break; default: usage(); } ARGALT('+') { case 'i': regex_flags &= ~REG_ICASE; have_unused_flags = 1; break; case 'n': regex_flags &= ~REG_NEWLINE; have_unused_flags = 1; break; default: usage(); } ARGEND; if (argc < 1 || argc > 3 || have_unused_flags || !nautomata) usage(); path = argv[0]; if (argc >= 2) { /* TODO set first_block */ } if (argc >= 3) { /* TODO set last_block */ if (last_block < first_block) usage(); } if (blksize > (size_t)(SSIZE_MAX / 4)) eprintf("selected blocksize is too large"); if (bufsize > (size_t)SSIZE_MAX) bufsize = (size_t)SSIZE_MAX; bufsize = bufsize / blksize; bufsize = MAX(bufsize, 2); bufsize *= blksize; text = emalloc(bufsize); first_block *= (off_t)blksize; if (last_block >= 0) { last_block += 1; last_block *= (off_t)blksize; } /* Open file to analyse */ fd = open(path, O_RDONLY); if (fd < 0) eprintf("open %s O_RDONLY", path); /* Can skip holes? */ if (fstat(fd, &st)) eprintf("fstat %s:", path); if (!S_ISREG(st.st_mode)) skip_holes = 0; /* Skip to first block to analyse */ if (first_block && lseek(fd, first_block, SEEK_SET) == (off_t)-1) { off_t remaining = first_block; while (remaining) { uintmax_t n = MIN((uintmax_t)remaining, (uintmax_t)bufsize); ssize_t r = read(fd, text, (size_t)n); if (r <= 0) { if (!r) goto out; if (errno == EINTR) continue; eprintf("read %s:", path); } position += (off_t)r; } } position = first_block; /* Analyse to last block (to end of file if last_block < 0) */ for (; last_block < 0 || first_block < last_block; first_block = data) { /* Seek to data to find end of hole, and analyse the hole */ if (skip_holes) { data = lseek(fd, first_block, SEEK_DATA); if (data == (off_t)-1) { data = first_block; if (errno != EBADF) skip_holes = 0; else eprintf("lseek %s %ji SEEK_DATA", path, (intmax_t)first_block); } else { available = encountered_hole(text, available, data - first_block); } } else { data = first_block; } /* Seek to hole to find end of data */ if (skip_holes) { hole = lseek(fd, data, SEEK_HOLE); if (hole == (off_t)-1) { if (errno != EBADF) skip_holes = 0; else eprintf("lseek %s %ji SEEK_HOLE", path, (intmax_t)data); } else { if (lseek(fd, data, SEEK_SET) == (off_t)-1) eprintf("lseek %s %ji SEEK_SET", path, (intmax_t)data); } } else { hole = (off_t)-1; } if (hole > last_block) hole = last_block; /* Analyse the data */ while (data < hole) { off_t n_off = hole - data; size_t n_size = bufsize - available; size_t n = (uintmax_t)n_off > (uintmax_t)SSIZE_MAX ? n_size : MIN((size_t)n_off, n_size); ssize_t r = read(fd, &text[available], n); if (r <= 0) { if (!r) goto eof; if (errno == EINTR) continue; eprintf("read %s:", path); } available += (size_t)r; if (available == bufsize) available = encountered_data(text, available); } } eof: /* Analyse end of file */ available = encountered_data(text, available); if (skip_holes && first_block < st.st_size) available = encountered_hole(text, available, st.st_size - first_block); encountered_eof(text, available); out: close(fd); while (nautomata--) { if (automata[nautomata].type == REGEX) regfree(&automata[nautomata].a.regex); else free(automata[nautomata].a.fixed.kmp); } free(automata); free(text); free(lower_text); return 0; }