/* See LICENSE file for copyright and license details. */ #include #include #include #include USAGE("[-b block-size] [-B buffer-size] [-h] " "([-i | +i | -n | +n] ... (-r basic-regex | -e extended-regex | -x text | -X hextext)) ... " "file [first-block [last-block]]"); enum automata_type { FIXED, FIXED_ICASE, REGEX }; struct fixed { LIBAUTOMATA_KMP_AUTOMATON *kmp; size_t len; }; struct automata { enum automata_type type; union { struct fixed fixed; regex_t regex; } a; }; static struct automata *automata = NULL; static size_t nautomata = 0u; static int analyse_holes = 0; static off_t position; static size_t blksize = 4ul << 10; static size_t bufsize = 8ul << 20; static char *lower_text = NULL; static size_t lower_text_size = 0u; static char *blkmarks; static size_t nblkmarks; static size_t low_mark; static size_t high_mark; static int stdout_is_a_tty; static int stderr_is_a_tty; static void mark(off_t beginning) { size_t mark = (size_t)(beginning - position) / blksize; blkmarks[mark] = 1; if (low_mark > mark) low_mark = mark; if (high_mark < mark) high_mark = mark; } static void analyse_block_with_fixed(char *block, size_t start_end, size_t available, struct fixed *automaton) { size_t extra, offset = 0; char *end, *beginning; extra = MIN(available - start_end, automaton->len); while (offset < start_end) { end = libautomata_execute_kmp_automaton(automaton->kmp, &block[offset], start_end - offset + extra); if (!end) break; beginning = end - automaton->len; offset = (size_t)(beginning - block); mark(position + (off_t)offset); offset += 1u; } } static void analyse_block_with_regex(char *block, size_t start_end, size_t available, regex_t *automaton) { size_t offset = 0u; regmatch_t match; int r; while (offset < start_end) { match.rm_so = (regoff_t)offset; match.rm_eo = (regoff_t)available; r = regexec(automaton, block, 1u, &match, REG_STARTEND); if (r == REG_NOMATCH) break; if (r) eprintf("regexec:"); if ((size_t)match.rm_so >= start_end) break; mark(position + (off_t)match.rm_so); offset = (size_t)match.rm_so + 1u; } } static void analyse_block(char *block, size_t start_end, size_t available) { size_t i = 0u, off = 0u, blkoff = 0u; int lowered = 0; memset(blkmarks, 0, nblkmarks); low_mark = SIZE_MAX; high_mark = 0u; for (;;) { switch (automata[i].type) { case FIXED_ICASE: if (!lowered) { lowered = 1; if (available - off > lower_text_size) lower_text = erealloc(lower_text, lower_text_size = available - off); libsimple_memtolower(&lower_text[off], &block[off], available - off); } analyse_block_with_fixed(&lower_text[off], start_end - off, available - off, &automata[i].a.fixed); break; case FIXED: analyse_block_with_fixed(&block[off], start_end - off, available - off, &automata[i].a.fixed); break; case REGEX: analyse_block_with_regex(&block[off], start_end - off, available - off, &automata[i].a.regex); break; default: abort(); } if (++i == nautomata) break; if (blkoff == low_mark) for (; blkoff <= high_mark && blkmarks[blkoff]; blkoff++) off += blksize; } for (i = 0u; i < nblkmarks; i++) { if (blkmarks[i]) { printf("%ju%s\n", (uintmax_t)position / blksize + i, stdout_is_a_tty ? "\033[K" : ""); fflush(stdout); } } } static size_t encountered_data(char *text, size_t available) { /* Wait until the buffer is full */ if (available < bufsize) return available; /* Process the buffer */ analyse_block(text, bufsize - blksize, available); position += (off_t)(bufsize - blksize); /* Save last block to support patterns spanning two blocks */ memcpy(&text[0], &text[bufsize - blksize], blksize); return blksize; } static size_t encountered_hole(char *text, size_t available, off_t size) { off_t r; /* If there are at least two blocks of data buffered, * process all but the last one (the last one have to * be skipped the processing requires two blocks) */ if (available >= blksize * 2u) { size_t full_avail = available - (available % blksize); size_t start_end = full_avail - blksize; analyse_block(text, start_end, full_avail); position += (off_t)start_end; available -= start_end; memcpy(&text[0], &text[start_end], available); } /* If there is data buffered remaining, fill to two * blocks with null bytes and analyse that data */ if (available) { size_t n = blksize * 2u - available; if ((uintmax_t)n > (uintmax_t)size) n = (size_t)size; memset(&text[available], 0, n); available += n; if (available < blksize * 2u) return available; analyse_block(text, available - blksize, available); position += (off_t)(available - blksize); size -= (off_t)n; available = blksize; } memset(text, 0, bufsize); size += (off_t)available; /* If we are not analyzing holes, skip pass the full * blocks and the full the buffer with as many null * bytes as there are bytes in the hole in excess of * full blocks */ if (!analyse_holes) { r = size % (off_t)blksize; position += size - r; return (size_t)r; } /* But if we are analysing holes... */ while (size >= (off_t)(blksize * 2u)) { size_t max = (size_t)MIN((uintmax_t)size, (uintmax_t)bufsize); size_t full_avail = max - (max % blksize); size_t start_end = full_avail - blksize; analyse_block(text, start_end, full_avail); position += (off_t)start_end; size -= (off_t)start_end; } return (size_t)size; } static void encountered_eof(char *text, size_t available) { analyse_block(text, available, available); } static void eregcomp(regex_t *restrict preg, const char *restrict regex, int cflags) { int e = regcomp(preg, regex, cflags); if (e) { char buf_static[512]; char *buf_dynamic = NULL; char *buf = buf_static; size_t buf_size = sizeof(buf_static); size_t r; regerror_again: r = regerror(e, preg, buf, buf_size); if (r > buf_size) { buf = buf_dynamic = erealloc(buf_dynamic, buf_size *= 2u); goto regerror_again; } eprintf("regcomp %s: %s", regex, buf); free(buf_dynamic); } } #if defined(__GNUC__) __attribute__((__pure__)) #endif static uintmax_t parse_uint_arg(const char *s, uintmax_t min, uintmax_t max) { uintmax_t v = 0u, d; if (!*s) usage(); while ('0' <= *s && *s <= '9') { d = (uintmax_t)(*s++ - '0'); if (d > 9u || v > (max - d) / 10u) /* we know that max>d */ usage(); v = v * 10u + d; } if (*s || v < min) usage(); return v; } static void decode_hex(char *bin, const char *hex, size_t n) { size_t i, j, end; uint8_t value; for (i = 0u, j = 0u; i < n; i += 1u) { value = 0u; for (end = j + 2u; j < end; j++) { value <<= 4; if ('0' <= hex[j] && hex[j] <= '9') value |= (uint8_t)(hex[j] - '0'); else if ('a' <= hex[j] && hex[j] <= 'f') value |= (uint8_t)(hex[j] - 'a' + 10); else if ('A' <= hex[j] && hex[j] <= 'F') value |= (uint8_t)(hex[j] - 'A' + 10); else usage(); } bin[i] = (char)value; } } int main(int argc, char *argv[]) { const char *path; const char *pattern; char *pattern_free; char *text; size_t len, available = 0u; int regex_flags = REG_NOSUB; int have_unused_flags = 0; off_t first_block = 0; off_t last_block = -1; off_t data, hole; int skip_holes = 1; int fd; struct stat st; ARGBEGIN { case 'r': case 'e': have_unused_flags = 0; automata = ereallocarray(automata, nautomata + 1u, sizeof(*automata)); eregcomp(&automata[nautomata].a.regex, ARG(), regex_flags | (FLAG() == 'e' ? REG_EXTENDED : 0)); automata[nautomata].type = REGEX; nautomata++; break; case 'x': have_unused_flags = 0; automata = ereallocarray(automata, nautomata + 1u, sizeof(*automata)); pattern = ARG(); automata[nautomata].a.fixed.len = len = strlen(pattern); if (!len) usage(); if (regex_flags & REG_ICASE) { automata[nautomata].type = FIXED_ICASE; pattern_free = estrdup(pattern); libsimple_memtolower(pattern_free, pattern_free, len); pattern = pattern_free; } else { automata[nautomata].type = FIXED; pattern_free = NULL; } automata[nautomata].a.fixed.kmp = libautomata_compile_kmp_automaton(pattern, len, sizeof(char)); if (!automata[nautomata].a.fixed.kmp) eprintf("libautomata_compile_kmp_automaton:"); nautomata++; free(pattern_free); break; case 'X': have_unused_flags = 0; automata = ereallocarray(automata, nautomata + 1u, sizeof(*automata)); pattern = ARG(); len = strlen(pattern); if (!len || (len & 1u)) usage(); automata[nautomata].a.fixed.len = len /= 2u; pattern_free = emalloc(len); decode_hex(pattern_free, pattern, len); if (regex_flags & REG_ICASE) { automata[nautomata].type = FIXED_ICASE; libsimple_memtolower(pattern_free, pattern_free, len); } else { automata[nautomata].type = FIXED; } automata[nautomata].a.fixed.kmp = libautomata_compile_kmp_automaton(pattern_free, len, sizeof(char)); if (!automata[nautomata].a.fixed.kmp) eprintf("libautomata_compile_kmp_automaton:"); nautomata++; free(pattern_free); break; case 'h': analyse_holes = 1; break; case 'b': blksize = (size_t)parse_uint_arg(ARG(), 1u, SIZE_MAX); break; case 'B': bufsize = (size_t)parse_uint_arg(ARG(), 1u, SIZE_MAX); break; case 'i': regex_flags |= REG_ICASE; have_unused_flags = 1; break; case 'n': regex_flags |= REG_NEWLINE; have_unused_flags = 1; break; default: usage(); } ARGALT('+') { case 'i': regex_flags &= ~REG_ICASE; have_unused_flags = 1; break; case 'n': regex_flags &= ~REG_NEWLINE; have_unused_flags = 1; break; default: usage(); } ARGEND; if (argc < 1 || argc > 3 || have_unused_flags || !nautomata) usage(); stderr_is_a_tty = isatty(STDERR_FILENO); if (!stderr_is_a_tty && errno == EBADF) exit(1); stdout_is_a_tty = isatty(STDOUT_FILENO); if (!stdout_is_a_tty && errno == EBADF) eprintf("isatty STDOUT_FILENO:"); path = argv[0]; if (!*path) usage(); if (argc >= 2) first_block = (off_t)parse_uint_arg(argv[1], 1u, (uintmax_t)OFF_MAX); if (argc >= 3) last_block = (off_t)parse_uint_arg(argv[2], (uintmax_t)first_block, (uintmax_t)OFF_MAX); if (blksize > (size_t)(SSIZE_MAX / 4)) eprintf("selected blocksize is too large"); if (bufsize > (size_t)SSIZE_MAX) bufsize = (size_t)SSIZE_MAX; nblkmarks = bufsize / blksize; nblkmarks = MAX(nblkmarks, 2); bufsize = nblkmarks * blksize; text = emalloc(bufsize); first_block *= (off_t)blksize; if (last_block >= 0) { last_block += 1; last_block *= (off_t)blksize; } blkmarks = emalloc(nblkmarks); /* Open file to analyse */ fd = open(path, O_RDONLY); if (fd < 0) eprintf("open %s O_RDONLY:", path); /* Advise the kernel about the access pattern */ posix_fadvise(fd, 0, first_block, POSIX_FADV_DONTNEED); if (last_block > 0) { posix_fadvise(fd, last_block, 0, POSIX_FADV_DONTNEED); posix_fadvise(fd, first_block, last_block, POSIX_FADV_SEQUENTIAL); posix_fadvise(fd, first_block, last_block, POSIX_FADV_NOREUSE); } else { posix_fadvise(fd, first_block, 0, POSIX_FADV_SEQUENTIAL); posix_fadvise(fd, first_block, 0, POSIX_FADV_NOREUSE); } /* Can skip holes? */ if (fstat(fd, &st)) eprintf("fstat %s:", path); if (!S_ISREG(st.st_mode)) skip_holes = 0; /* Skip to first block to analyse */ if (first_block && lseek(fd, first_block, SEEK_SET) == (off_t)-1) { off_t remaining = first_block; if (stderr_is_a_tty) { /* TODO show progress */ fprintf(stderr, "Skipping...\n\033[A"); fflush(stderr); } while (remaining) { uintmax_t n = MIN((uintmax_t)remaining, (uintmax_t)bufsize); ssize_t r = read(fd, text, (size_t)n); if (r <= 0) { if (!r) goto out; if (errno == EINTR) continue; eprintf("read %s:", path); } remaining -= (off_t)r; } } position = first_block; /* Analyse to last block (to end of file if last_block < 0) */ for (; last_block < 0 || first_block < last_block; first_block = data) { if (stderr_is_a_tty) { /* TODO improve output */ fprintf(stderr, "Reading at %ju\033[K\n\033[A", (uintmax_t)position + available); fflush(stderr); } /* Seek to data to find end of hole, and analyse the hole */ if (skip_holes) { data = lseek(fd, first_block, SEEK_DATA); if (data == (off_t)-1) { data = first_block; if (errno != EBADF) skip_holes = 0; else eprintf("lseek %s %ji SEEK_DATA:", path, (intmax_t)first_block); } else { available = encountered_hole(text, available, data - first_block); } } else { data = first_block; } /* Seek to hole to find end of data */ if (skip_holes) { hole = lseek(fd, data, SEEK_HOLE); if (hole == (off_t)-1) { if (errno != EBADF) skip_holes = 0; else eprintf("lseek %s %ji SEEK_HOLE:", path, (intmax_t)data); } else { if (lseek(fd, data, SEEK_SET) == (off_t)-1) eprintf("lseek %s %ji SEEK_SET:", path, (intmax_t)data); } } else { hole = (off_t)-1; } if (hole > last_block) hole = last_block; /* Analyse the data */ for (;;) { size_t n; ssize_t r; if (hole < 0) { n = bufsize - available; } else if (data < hole) { off_t n_off = hole - data; size_t n_size = bufsize - available; n = (uintmax_t)n_off > (uintmax_t)SSIZE_MAX ? n_size : MIN((size_t)n_off, n_size); } else { break; } if (stderr_is_a_tty) { /* TODO improve output */ fprintf(stderr, "Reading at %ju\033[K\n\033[A", (uintmax_t)position + available); fflush(stderr); } r = read(fd, &text[available], n); if (r <= 0) { if (!r) goto eof; if (errno == EINTR) continue; eprintf("read %s:", path); } data += (off_t)r; available += (size_t)r; if (available == bufsize) available = encountered_data(text, available); } } eof: /* Analyse end of file */ available = encountered_data(text, available); if (skip_holes && first_block < st.st_size) available = encountered_hole(text, available, st.st_size - first_block); encountered_eof(text, available); out: if (stderr_is_a_tty) { fprintf(stderr, "\033[K"); fflush(stderr); } close(fd); while (nautomata--) { if (automata[nautomata].type == REGEX) regfree(&automata[nautomata].a.regex); else free(automata[nautomata].a.fixed.kmp); } free(automata); free(text); free(lower_text); free(blkmarks); return 0; }