diff options
author | Mattias Andrée <maandree@kth.se> | 2021-02-23 01:11:51 +0100 |
---|---|---|
committer | Mattias Andrée <maandree@kth.se> | 2021-02-23 01:12:15 +0100 |
commit | 33d0ee66d80d664aec3511b1827049e538642658 (patch) | |
tree | fa3e0ae6a88ff139bb37923d348562b9d851420c /median | |
parent | update dist (diff) | |
download | median-33d0ee66d80d664aec3511b1827049e538642658.tar.gz median-33d0ee66d80d664aec3511b1827049e538642658.tar.bz2 median-33d0ee66d80d664aec3511b1827049e538642658.tar.xz |
Change license, rewrite in C, remove info manual, remove dist2.0
Signed-off-by: Mattias Andrée <maandree@kth.se>
Diffstat (limited to '')
-rwxr-xr-x | median | bin | 0 -> 27752 bytes | |||
-rw-r--r-- | median.1 | 60 | ||||
-rw-r--r-- | median.c | 308 |
3 files changed, 368 insertions, 0 deletions
Binary files differ diff --git a/median.1 b/median.1 new file mode 100644 index 0000000..f4756af --- /dev/null +++ b/median.1 @@ -0,0 +1,60 @@ +.TH MEDIAN 1 MEDIAN +.SH NAME +median - Calculate the median values for a set of groups +.SH SYNOPSIS +.B median +.SH DESCRIPTION +.B median +takes the first blank space-separated column as values and +the remainder as keys. It will then print the median +for each key. Lines with the same key form a group. +.PP +.B median +outputs the median value for each key, no order of the +output lines is prescribed. No order is prescribed for +the input lines. +.PP +Lines without a blank space are parsed as having the empty +string as the key, for lines with a blank space, the first +blank space is parsed as part of the key. +.PP +For groups with an even number of elements, the mean of +the middle two values is used as the median if all values +in the group are numerical, otherwise the lower value is +used as the median. +.SH EXAMPLES +.nf +$ cat <<EOF | median +\-10 +2 +4 +3 +50 +EOF +3 +.fi +.PP +.nf +$ cat <<EOF | median +\+003 c +\&.001 a +\-002 b +\+001 c +\-001 b +\+002 c +EOF +\&.001 a +\-001.5 b +\+002 c +.fi +.SH RATIONALE +Combining +.BR sort (1), +.BR sed (1), +.BR wc (1), +.BR expr (1) +to do this is too much work to do on a regular basis. +.SH "SEE ALSO" +.BR sort (1), +.BR expr (1), +.BR sets (1) diff --git a/median.c b/median.c new file mode 100644 index 0000000..d282426 --- /dev/null +++ b/median.c @@ -0,0 +1,308 @@ +/* See LICENSE file for copyright and license details. 
*/ +#include <ctype.h> +#include <errno.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + + +struct group { + char *key; + struct group *next; + struct group *prev; + char **elems; + size_t nelems; + size_t elems_size; + int numerical; +}; + + +static const char *argv0 = "median"; + +static struct group groups_head; +static struct group groups_tail; + + +static int +isnumerical(const char *s) +{ + if (*s == '+' || *s == '-') + s++; + if (s[0] == '.' && !s[1]) + return 0; + while (isdigit(*s)) + s++; + if (*s == '.') { + s++; + while (isdigit(*s)) + s++; + } + return *s ? 0 : 1; +} + + +static int +cmp_num(const void *apv, const void *bpv) +{ + const char *a = *(const char **)apv; + const char *b = *(const char **)bpv; + int mul = 1; + size_t an = 0, bn = 0, i; + + if (*a == '-' || *b == '-') { + mul = -1; + a = &a[1]; + b = &b[1]; + } else if (*a == '-') { + return -1; + } else if (*b == '-') { + return +1; + } else { + a = &a[*a == '+']; + b = &b[*b == '+']; + } + + while (*a == '0') a++; + while (*b == '0') b++; + + while (a[an] && a[an] != '.') an++; + while (b[bn] && b[bn] != '.') bn++; + + if (an != bn) + return an < bn ? 
-mul : mul; + + for (i = 0; a[i] && a[i] == b[i]; i++); + a = &a[i]; + b = &b[i]; + + if (i > an) { + if (!*a) + while (*b == '0') b++; + if (!*b) + while (*a == '0') a++; + } + + if (!*a && !*b) { + return 0; + } else if (!*a) { + return -mul; + } else if (!*b) { + return mul; + } else if (*a < *b) { + return -mul; + } else { + return mul; + } +} + + +static int +cmp_str(const void *apv, const void *bpv) +{ + const char *a = *(const char **)apv; + const char *b = *(const char **)bpv; + return strcmp(a, b); +} + + +static int +avg(char *a, const char *b) +{ + size_t i; + int carry = 0, val; + for (i = 0; a[i]; i++) { + val = (a[i] & 15) + (b[i] & 15); + carry = val & 1; + a[i] = (val >> 1) | '0'; + } + return carry; +} + + +static int +subavg(char *a, const char *b) +{ + size_t i; + int carry = 0, val; + for (i = 0; a[i]; i++) { + val = (a[i] & 15) - (b[i] & 15); + carry = val & 1; + a[i] = (val >> 1) | '0'; + } + return carry; +} + + +static void +median2(const char *low, const char *high, const char *key) +{ + int low_plus = *low == '+', low_minus = *low == '-', low_dot; + int high_plus = *high == '+', high_minus = *high == '-', high_dot; + size_t low_int, low_frac = 0, high_int, high_frac = 0; + size_t max_int, max_frac, i; + char *low2, *high2, *tmp; + const char *prefix; + int carry; + + for (low_int = 0; low[low_int] && low[low_int] != '.'; low_int++); + for (high_int = 0; high[high_int] && high[high_int] != '.'; high_int++); + low_dot = low[low_int] == '.'; + high_dot = high[high_int] == '.'; + + low = &low[low_plus | low_minus]; + high = &high[high_plus | high_minus]; + + if (low_dot) + low_frac = strlen(&low[low_int + 1]); + if (high_dot) + high_frac = strlen(&high[high_int + 1]); + + max_int = low_int > high_int ? low_int : high_int; + max_frac = low_frac > high_frac ? 
low_frac : high_frac; + + low2 = malloc(max_int + max_frac + 1); + high2 = malloc(max_int + max_frac + 1); + if (!low2 || !high2) { + perror(argv0); + exit(1); + } + low2[max_int + max_frac] = '\0'; + high2[max_int + max_frac] = '\0'; + + memset(low2, '0', max_int - low_int); + memcpy(&low2[max_int - low_int], low, low_int); + memcpy(&low2[max_int], &low[low_int + 1], low_frac); + memset(&low2[max_int + low_frac], '0', max_frac - low_frac); + + memset(high2, '0', max_int - high_int); + memcpy(&high2[max_int - high_int], high, high_int); + memcpy(&high2[max_int], &high[high_int + 1], high_frac); + memset(&high2[max_int + high_frac], '0', max_frac - high_frac); + + if (low_minus && high_minus) { + prefix = "-"; + carry = avg(high2, low2); + } else if (low_minus) { + for (i = 0; low2[i] && low2[i] == high2[i]; i++); + if (low2[i] <= high2[i]) { + prefix = "+"; + carry = subavg(high2, low2); + } else { + prefix = "-"; + carry = subavg(low2, high2); + tmp = low2, low2 = high2, high2 = tmp; + } + } else { + prefix = (low_plus || high_plus) ? "+" : ""; + carry = avg(high2, low2); + } + + printf("%s%.*s%s""%s%s%s\n", + prefix, (int)max_int, high2, + (carry || max_frac) ? "." : "", &high2[max_int], carry ? 
"5" : "", key); + + free(low2); + free(high2); +} + + +int +main(int argc, char *argv[]) +{ + char *line = NULL, *key; + struct group *group = NULL; + size_t size = 0; + ssize_t len; + + groups_head.next = &groups_tail; + groups_tail.prev = &groups_head; + + if (argc) { + argv0 = *argv++, argc--; + if (argc && argv[0][0] == '-' && argv[0][1] == '-' && !argv[0][2]) + argv++, argc--; + if (argc) { + fprintf(stderr, "usage: %s\n", argv0); + return 1; + } + } + + while ((len = getdelim(&line, &size, '\n', stdin)) > 0) { + if (len && line[--len] == '\n') + line[len] = '\0'; + + for (key = line; *key && !isspace(*key); key++); + if (group && !strcmp(group->key, key)) + goto found_group; + for (group = groups_head.next; group->key; group = group->next) + if (!strcmp(group->key, key)) + goto found_group; + group = calloc(1, sizeof(*group)); + if (!group) { + perror(argv0); + return 1; + } + group->key = strdup(key); + if (!group->key) { + perror(argv0); + return 1; + } + group->numerical = 1; + group->prev = groups_tail.prev; + group->next = &groups_tail; + groups_tail.prev->next = group; + groups_tail.prev = group; + + found_group: + if (group->nelems == group->elems_size) { + if (group->elems_size > SIZE_MAX / 2 / sizeof(*group->elems)) { + errno = ENOMEM; + perror(argv0); + return 1; + } + group->elems_size = group->elems_size ? group->elems_size * 2 : 16; + group->elems = realloc(group->elems, group->elems_size * sizeof(*group->elems)); + if (!group->elems) { + perror(argv0); + return 1; + } + } + *key = '\0'; + group->elems[group->nelems] = strdup(line); + if (!group->elems[group->nelems]) { + perror(argv0); + return 1; + } + if (group->numerical) + if (!isnumerical(line)) + group->numerical = 0; + group->nelems++; + } + + if (ferror(stdin)) { + perror(argv0); + return 1; + } + + free(line); + while ((group = groups_head.next)->key) { + qsort(group->elems, group->nelems, sizeof(*group->elems), group->numerical ? 
cmp_num : cmp_str); + if (group->nelems % 2 || !group->numerical) + printf("%s%s\n", group->elems[(group->nelems - 1) / 2], group->key); + else if (group->nelems) + median2(group->elems[group->nelems / 2 - 1], group->elems[group->nelems / 2 - 0], group->key); + groups_head.next = group->next; + while (group->nelems--) + free(group->elems[group->nelems]); + free(group->elems); + free(group->key); + free(group); + } + + if (fflush(stdout) || ferror(stdout) || fclose(stdout)) { + perror(argv0); + return 1; + } + return 0; +} |