/* See LICENSE file for copyright and license details. */
#include "common.h"

/* TODO limitation: 32 values per channel, limits palette size */
#define WORKING_BITS 5U


struct channel_data {
	const struct libquanta_channel *ch;
	size_t subindex_multiplier;
	size_t colour_shift;
	struct bigint *histogram;
	struct bigint whole;
	struct bigint half;
	struct bigint base;
	long double max;
	size_t cut;
	int has_cut;
};

struct histogram {
	size_t occurrences;
	long double square_sum;
};

struct cube_boundary {
	size_t min; /* exclusive */
	size_t max; /* inclusive */
};

struct cube {
	size_t volume;
	struct cube_boundary bounds[];
};


static void
compute_histogram(struct libquanta_image *image, struct histogram *histogram, struct channel_data *channels, size_t nchannels)
{
	size_t x, width  = channels[0].ch->image_width;
	size_t y, height = channels[0].ch->image_height;
	size_t ch, i, index, pixel;
	long double square_subsum;
	const char *p;
	uint64_t v;

	for (y = 0, pixel = 0; y < height; y++) {
		for (x = 0; x < width; x++, pixel++) {
			index = 0;
			square_subsum = 0;
			for (ch = nchannels; ch--;) {
				i  = y * channels[ch].ch->image_row_size;
				i += x * channels[ch].ch->image_cell_size;
				p = &((const char *)channels[ch].ch->image)[i];
				if (channels[ch].ch->bits_in <= 8)
					v = *(const uint8_t *)p;
				else if (channels[ch].ch->bits_in <= 16)
					v = *(const uint16_t *)p;
				else if (channels[ch].ch->bits_in <= 32)
					v = *(const uint32_t *)p;
				else if (channels[ch].ch->bits_in <= 64)
					v = *(const uint64_t *)p;
				else
					abort();
				square_subsum += (long double)v * (long double)v;
				v >>= channels[ch].colour_shift;
				v += 1U;
				index += v * channels[ch].subindex_multiplier;
				bigint_add_small(&channels[ch].histogram[index], v);
			}
			histogram[index].occurrences += 1U;
			histogram[index].square_sum += square_subsum;
			image->image[pixel] = index;
		}
	}
}


static void
moments_bigint_(size_t channel, const struct channel_data *channels, size_t nchannels,
                struct bigint *out, const struct bigint *in, size_t index)
{
	size_t i, mul;

	if (!nchannels) {
		if (in != out)
			memcpy(&out[index], &in[index], sizeof(*in));
		bigint_add_big(&out[index], &out[index - channels[channel].subindex_multiplier]);
		return;
	}

	mul = channels[--nchannels].subindex_multiplier;
	for (i = 0; i++ < (1U << WORKING_BITS);)
		moments_bigint_(channel, channels, nchannels, out, in, index + i * mul);
}

static void
moments_bigint(size_t channel, struct channel_data *channels, size_t nchannels, struct bigint *buf, size_t bufsize)
{
	struct bigint *in = channels[channel].histogram;
	size_t i;

	memset(buf, 0, bufsize);

	for (i = nchannels; i-- > 1U;) {
		moments_bigint_(i, channels, nchannels, buf, in, 0);
		in = buf;
	}

	moments_bigint_(0, channels, nchannels, channels[channel].histogram, in, 0);
}

static void
moments_histogram_(size_t channel, const struct channel_data *channels, size_t nchannels,
                   struct histogram *out, const struct histogram *in, size_t index)
{
	size_t i, mul;

	if (!nchannels) {
		size_t prev = index - channels[channel].subindex_multiplier;
		out[index].occurrences = out[prev].occurrences + in[index].occurrences;
		out[index].square_sum = out[prev].square_sum + in[index].square_sum;
		return;
	}

	mul = channels[--nchannels].subindex_multiplier;
	for (i = 0; i++ < (1U << WORKING_BITS);)
		moments_histogram_(channel, channels, nchannels, out, in, index + i * mul);
}

static void
moments_histogram(struct histogram *histogram, const struct channel_data *channels, size_t nchannels,
                  struct histogram *buf, size_t bufsize)
{
	struct histogram *in = histogram;
	size_t i;

	memset(buf, 0, bufsize);

	for (i = nchannels; i-- > 1U;) {
		moments_histogram_(i, channels, nchannels, buf, in, 0);
		in = buf;
	}

	moments_histogram_(0, channels, nchannels, histogram, in, 0);
}

static int
compute_cumulative_moments(struct histogram *histogram, struct channel_data *channels, size_t nchannels, size_t bufelems)
{
	void *buf;
	size_t i, bufsize;

	bufsize = bufelems * sizeof(struct bigint);
	buf = malloc(bufsize);
	if (!buf)
		return -1;
	for (i = 0; i < nchannels; i++)
		moments_bigint(i, channels, nchannels, buf, bufsize);
	free(buf);

	bufsize = bufelems * sizeof(struct histogram);
	buf = malloc(bufsize);
	if (!buf)
		return -1;
	moments_histogram(histogram, channels, nchannels, buf, bufsize);
	free(buf);

	return 0;
}


static void
volume_over_square_sums_(const struct cube *cube, const struct histogram *data, const struct channel_data *channels, size_t nchannels,
                         size_t index, int use_addition, long double *result)
{
	size_t ch;

	if (!nchannels) {
		if (use_addition)
			*result += data[index].square_sum;
		else
			*result -= data[index].square_sum;
		return;
	}

	ch = nchannels - 1U;

	volume_over_square_sums_(cube, data, channels, ch,
	                         index + cube->bounds[ch].max * channels[ch].subindex_multiplier,
	                         use_addition ^ 1, result);

	volume_over_square_sums_(cube, data, channels, ch,
	                         index + cube->bounds[ch].min * channels[ch].subindex_multiplier,
	                         use_addition, result);
}

static long double
volume_over_square_sums(const struct cube *cube, const struct histogram *data, const struct channel_data *channels, size_t nchannels)
{
	long double result = 0;
	int use_addition = (int)(~nchannels & 1U);
	volume_over_square_sums_(cube, data, channels, nchannels, 0, use_addition, &result);
	return result;
}


static void
volume_over_occurrences_(const struct cube *cube, const struct histogram *data, const struct channel_data *channels, size_t nchannels,
                         size_t index, int use_addition, uintmax_t *result)
{
	size_t ch;

	if (!nchannels) {
		uintmax_t v = (uintmax_t)data[index].occurrences;
		if (use_addition)
			*result += v;
		else
			*result -= v;
		return;
	}

	ch = nchannels - 1U;

	volume_over_occurrences_(cube, data, channels, ch,
	                         index + cube->bounds[ch].max * channels[ch].subindex_multiplier,
	                         use_addition ^ 1, result);

	volume_over_occurrences_(cube, data, channels, ch,
	                         index + cube->bounds[ch].min * channels[ch].subindex_multiplier,
	                         use_addition, result);
}

static uintmax_t
volume_over_occurrences(const struct cube *cube, const struct histogram *data, const struct channel_data *channels, size_t nchannels)
{
	uintmax_t result = 0;
	int use_addition = (int)(~nchannels & 1U);
	volume_over_occurrences_(cube, data, channels, nchannels, 0, use_addition, &result);
	return result;
}


static void
volume_over_bigints_(const struct cube *cube, const struct bigint *data, const struct channel_data *channels, size_t nchannels,
                     size_t index, int use_addition, struct bigint *result)
{
	size_t ch;

	if (!nchannels) {
		if (use_addition)
			bigint_add_big(result, &data[index]);
		else
			bigint_sub_big(result, &data[index]);
		return;
	}

	ch = nchannels - 1U;

	volume_over_bigints_(cube, data, channels, ch,
	                     index + cube->bounds[ch].max * channels[ch].subindex_multiplier,
	                     use_addition ^ 1, result);

	volume_over_bigints_(cube, data, channels, ch,
	                     index + cube->bounds[ch].min * channels[ch].subindex_multiplier,
	                     use_addition, result);
}

static void
volume_over_bigints(const struct cube *cube, const struct bigint *data, const struct channel_data *channels, size_t nchannels,
                    struct bigint *result)
{
	int use_addition = (int)(~nchannels & 1U);
	result->high = 0;
	result->low = 0;
	volume_over_bigints_(cube, data, channels, nchannels, 0, use_addition, result);
}


static void
bottom_over_occurrences_(const struct cube *cube, size_t channel, const struct histogram *data, const struct channel_data *channels,
                         size_t nchannels, size_t index, int use_addition, uintmax_t *result)
{
	size_t ch;

	if (!nchannels) {
		uintmax_t v = (uintmax_t)data[index].occurrences;
		if (use_addition)
			*result += v;
		else
			*result -= v;
		return;
	}

	ch = nchannels - 1U;

	bottom_over_occurrences_(cube, channel, data, channels, ch,
	                         index + cube->bounds[ch].min * channels[ch].subindex_multiplier,
	                         use_addition, result);

	if (ch == channel)
		return;

	bottom_over_occurrences_(cube, channel, data, channels, ch,
	                         index + cube->bounds[ch].max * channels[ch].subindex_multiplier,
	                         use_addition ^ 1, result);
}

static uintmax_t
bottom_over_occurrences(const struct cube *cube, size_t channel, const struct histogram *data, const struct channel_data *channels,
                        size_t nchannels)
{
	uintmax_t result = 0;
	int use_addition = (int)(~nchannels & 1U);
	bottom_over_occurrences_(cube, channel, data, channels, nchannels, 0, use_addition, &result);
	return result;
}


static void
bottom_over_bigints_(const struct cube *cube, size_t channel, const struct bigint *data, const struct channel_data *channels,
                     size_t nchannels, size_t index, int use_addition, struct bigint *result)
{
	size_t ch;

	if (!nchannels) {
		if (use_addition)
			bigint_add_big(result, &data[index]);
		else
			bigint_sub_big(result, &data[index]);
		return;
	}

	ch = nchannels - 1U;

	bottom_over_bigints_(cube, channel, data, channels, ch,
	                     index + cube->bounds[ch].min * channels[ch].subindex_multiplier,
	                     use_addition, result);

	if (ch == channel)
		return;

	bottom_over_bigints_(cube, channel, data, channels, ch,
	                     index + cube->bounds[ch].max * channels[ch].subindex_multiplier,
	                     use_addition ^ 1, result);
}

static void
bottom_over_bigints(const struct cube *cube, size_t channel, const struct bigint *data, const struct channel_data *channels,
                    size_t nchannels, struct bigint *result)
{
	int use_addition = (int)(~nchannels & 1U);
	result->high = 0;
	result->low = 0;
	bottom_over_bigints_(cube, channel, data, channels, nchannels, 0, use_addition, result);
}


static void
top_over_occurrences_(const struct cube *cube, size_t channel, size_t position, const struct histogram *data,
                      const struct channel_data *channels, size_t nchannels, size_t index, int use_addition, uintmax_t *result)
{
	size_t ch;

	if (!nchannels) {
		uintmax_t v = (uintmax_t)data[index].occurrences;
		if (use_addition)
			*result += v;
		else
			*result -= v;
		return;
	}

	ch = nchannels - 1U;
	
	if (ch == channel) {
		top_over_occurrences_(cube, channel, position, data, channels, ch,
		                      index + position * channels[ch].subindex_multiplier,
		                      use_addition ^ 1, result);

	} else {
		top_over_occurrences_(cube, channel, position, data, channels, ch,
		                      index + cube->bounds[ch].min * channels[ch].subindex_multiplier,
		                      use_addition, result);

		top_over_occurrences_(cube, channel, position, data, channels, ch,
		                      index + cube->bounds[ch].max * channels[ch].subindex_multiplier,
		                      use_addition ^ 1, result);
	}
}

static uintmax_t
top_over_occurrences(const struct cube *cube, size_t channel, size_t position, const struct histogram *data,
                     const struct channel_data *channels, size_t nchannels)
{
	uintmax_t result = 0;
	int use_addition = (int)(~nchannels & 1U);
	top_over_occurrences_(cube, channel, position, data, channels, nchannels, 0, use_addition, &result);
	return result;
}


static void
top_over_bigints_(const struct cube *cube, size_t channel, size_t position, const struct bigint *data,
                  const struct channel_data *channels, size_t nchannels, size_t index, int use_addition, struct bigint *result)
{
	size_t ch;

	if (!nchannels) {
		if (use_addition)
			bigint_add_big(result, &data[index]);
		else
			bigint_sub_big(result, &data[index]);
		return;
	}

	ch = nchannels - 1U;
	
	if (ch == channel) {
		top_over_bigints_(cube, channel, position, data, channels, ch,
		                  index + position * channels[ch].subindex_multiplier,
		                  use_addition ^ 1, result);

	} else {
		top_over_bigints_(cube, channel, position, data, channels, ch,
		                  index + cube->bounds[ch].min * channels[ch].subindex_multiplier,
		                  use_addition, result);

		top_over_bigints_(cube, channel, position, data, channels, ch,
		                  index + cube->bounds[ch].max * channels[ch].subindex_multiplier,
		                  use_addition ^ 1, result);
	}
}

static void
top_over_bigints(const struct cube *cube, size_t channel, size_t position, const struct bigint *data,
                 const struct channel_data *channels, size_t nchannels, struct bigint *result)
{
	int use_addition = (int)(~nchannels & 1U);
	result->high = 0;
	result->low = 0;
	top_over_bigints_(cube, channel, position, data, channels, nchannels, 0, use_addition, result);
}


static void
maximise(const struct cube *cube, size_t channel, const struct histogram *histogram, struct channel_data *channels,
         size_t nchannels, uintmax_t whole_weight)
{
	uintmax_t base_weight;
	uintmax_t half_weight;
	size_t i, pos;
	size_t first, last;
	long double temp, temp2, v;

	channels[channel].max = 0;
	channels[channel].cut = 0;
	channels[channel].has_cut = 0;

	for (i = 0; i < nchannels; i++)
		bottom_over_bigints(cube, channel, channels[i].histogram, channels, nchannels, &channels[i].base);
	base_weight = bottom_over_occurrences(cube, channel, histogram, channels, nchannels);

	first = cube->bounds[channel].min + 1U;
	last = cube->bounds[channel].max;
	for (pos = first; pos < last; pos++) { /* TODO should `last` really be exclusive? */
		half_weight = top_over_occurrences(cube, channel, pos, histogram, channels, nchannels);
		half_weight += base_weight;
		if (!half_weight)
			continue;

		temp = 0;
		for (i = 0; i < nchannels; i++) {
			top_over_bigints(cube, channel, pos, channels[i].histogram, channels, nchannels, &channels[i].half);
			bigint_add_big(&channels[i].half, &channels[i].base);

			v = (long double)channels[i].half.high;
			v = ldexpl(v, 8 * sizeof(channels[i].half.low));
			v += (long double)channels[i].half.low;
			temp = fmal(v, v, temp);
		}
		temp /= (long double)half_weight;

		half_weight = whole_weight - half_weight;
		if (!half_weight)
			continue;

		temp2 = 0;
		for (i = 0; i < nchannels; i++) {
			bigint_rsub_big(&channels[i].half, &channels[i].whole); /* half = whole - half */

			v = (long double)channels[i].half.high;
			v = ldexpl(v, 8 * sizeof(channels[i].half.low));
			v += (long double)channels[i].half.low;
			temp2 = fmal(v, v, temp2);
		}
		temp += temp2 / (long double)half_weight;

		if (temp > channels[channel].max) {
			channels[channel].max = temp;
			channels[channel].cut = pos;
			channels[channel].has_cut = 1;
		}
	}
}


static int
cut(struct cube *cube1, struct cube *cube2, const struct histogram *histogram, struct channel_data *channels, size_t nchannels)
{
	size_t i, ch;
	uintmax_t whole_weight;
	long double max_max;

	for (i = 0; i < nchannels; i++)
		volume_over_bigints(cube1, channels[i].histogram, channels, nchannels, &channels[i].whole);
	whole_weight = volume_over_occurrences(cube1, histogram, channels, nchannels);

	for (i = 0; i < nchannels; i++)
		maximise(cube1, i, histogram, channels, nchannels, whole_weight);

	max_max = channels[0].max;
	ch = 0;
	for (i = 1; i < nchannels; i++) {
		if (channels[i].max > max_max) {
			max_max = channels[i].max;
			ch = i;
		}
	}

	if (!channels[ch].has_cut)
		return 0;

	for (i = 0; i < nchannels; i++) {
		cube2->bounds[i].max = cube1->bounds[i].max;
		cube2->bounds[i].min = cube1->bounds[i].min;
	}

	cube2->bounds[ch].min = cube1->bounds[ch].max = channels[ch].cut;

	cube1->volume = 1U;
	cube2->volume = 1U;
	for (i = 0; i < nchannels; i++) {
		cube1->volume *= cube1->bounds[i].max - cube1->bounds[i].min;
		cube2->volume *= cube2->bounds[i].max - cube2->bounds[i].min;
	}

	return 1;
}


static long double
variance(const struct cube *cube, const struct histogram *histogram, const struct channel_data *channels, size_t nchannels)
{
	uintmax_t weight;
	long double w, x, y = 0;
	struct bigint v;
	size_t i;

	x = volume_over_square_sums(cube, histogram, channels, nchannels);

	for (i = 0; i < nchannels; i++) {
		volume_over_bigints(cube, channels[i].histogram, channels, nchannels, &v);
		w = (long double)v.high;
		w = ldexpl(w, 8 * sizeof(v.low));
		w += (long double)v.low;
		y = fmal(w, w, y);
	}

	weight = volume_over_occurrences(cube, histogram, channels, nchannels);
	y /= (long double)weight;

	return x - y;
}


static void
mark_(const struct cube *cube, size_t label, size_t *tag, const struct channel_data *channels, size_t nchannels, size_t index)
{
	size_t i, ch;

	if (!nchannels) {
		tag[index] = label;
		return;
	}

	ch = nchannels - 1U;
	for (i = cube->bounds[ch].min; i++ < cube->bounds[ch].max;)
		mark_(cube, label, tag, channels, ch, index + i * channels[ch].subindex_multiplier);
}

static void
mark(const struct cube *cube, size_t label, size_t *tag, const struct channel_data *channels, size_t nchannels)
{
	mark_(cube, label, tag, channels, nchannels, 0);
}


static int
partition(struct cube **cubes, const struct histogram *histogram, struct channel_data *channels, size_t nchannels,
          size_t ncolours, size_t *ncubes_out)
{
	size_t i, j, next;
	long double temp;
	long double *v;

	if (ncolours > SIZE_MAX / sizeof(*v)) {
		errno = ENOMEM;
		return -1;
	}
	v = malloc(ncolours * sizeof(*v));
	if (!v)
		return -1;

	for (i = 0; i < nchannels; i++) {
		cubes[0]->bounds[i].min = 0;
		cubes[0]->bounds[i].max = (1U << WORKING_BITS);
	}

	next = 0;
	for (i = 1; i < ncolours; i++) {
		if (cut(cubes[next], cubes[i], histogram, channels, nchannels)) {
			v[next] = cubes[next]->volume > 1U ? variance(cubes[next], histogram, channels, nchannels) : 0U;
			v[i] = cubes[i]->volume > 1U ? variance(cubes[i], histogram, channels, nchannels) : 0U;
		} else {
			v[next] = 0;
			i--;
		}
		next = 0;
		temp = v[0];
		for (j = 0; j++ < i;)
			if (v[i] > temp)
				temp = v[next = j];
		if (temp <= 0) {
			i++;
			break;
		}
	}
	*ncubes_out = i;

	free(v);
	return 0;
}


struct libquanta_image *
libquanta_vquantise_wu(struct libquanta_palette *palette, va_list args)
{
	struct channel_data *channels = NULL;
	const struct libquanta_channel *channel;
	va_list args2;
	size_t ncolours, nchannels;
	size_t width = 0, height = 0;
	size_t i, j, n, subindex_multiplier;
	struct histogram *histogram = NULL;
	struct libquanta_image *ret = NULL;
	size_t size, image_size, *tag = NULL;
	uintmax_t weight, mean;
	struct bigint sum;
	struct cube **cubes = NULL;
	size_t cube_size;
	size_t ncolours_used;

	/* inspect argument `palette`, and get number of output colours */
	if (!palette || !palette->size)
		goto einval;
	ncolours = palette->size;

	/* inspect argument `args`, and get number of input colour channels */
	va_copy(args2, args);
	nchannels = 0;
	for (; (channel = va_arg(args2, const struct libquanta_channel *)); nchannels++) {
		if (!nchannels) {
			width = channel->image_width;
			height = channel->image_height;
		} else if (channel->image_width != width || channel->image_height != height) {
			goto einval;
		}
		if (!channel->bits_in || channel->bits_in > PALETTE_VALUE_MAX_BITS)
			goto einval;
		if (!channel->bits_out || channel->bits_out > channel->bits_in)
			goto einval;
		if (!channel->image && (channel->image_width | channel->image_height))
			goto einval;
	}
	if (!nchannels)
		goto einval;
	va_end(args2);

	/* allocate working memory for each channel, and save channel argument and precompute data */
	channels = calloc(nchannels, sizeof(*channels));
	if (!channels)
		goto fail;
	for (i = 0; (channel = va_arg(args, const struct libquanta_channel *)); i++) {
		channels[i].ch = channel;
		channels[i].histogram = NULL;
	}
	subindex_multiplier = 1U;
	for (i = nchannels; i--; subindex_multiplier *= ((1U << WORKING_BITS) + 1U)) {
		channels[i].subindex_multiplier = subindex_multiplier;
		channels[i].colour_shift = channels[i].ch->bits_in > WORKING_BITS ? channels[i].ch->bits_in - WORKING_BITS : 0U;
	}
	for (i = 0; i < nchannels; i++) {
		channels[i].histogram = calloc(subindex_multiplier, sizeof(*channels[i].histogram));
		if (!channels[i].histogram)
			goto fail;
	}

	/* allocate working memory that is not per channel */
	histogram = calloc(subindex_multiplier, sizeof(*histogram));
	if (!histogram)
		goto fail;
	if (subindex_multiplier > SIZE_MAX / sizeof(*tag))
		goto enomem;
	tag = malloc(subindex_multiplier * sizeof(*tag));
	if (!tag)
		goto fail;
	if (ncolours > SIZE_MAX / sizeof(*cubes))
		goto enomem;
	cubes = malloc(ncolours * sizeof(*cubes));
	if (!cubes)
		goto fail;
	for (i = 0; i < ncolours; i++)
		cubes[i] = NULL;
	if (nchannels > (SIZE_MAX - offsetof(struct cube, bounds)) / sizeof(struct cube_boundary))
		goto enomem;
	cube_size = offsetof(struct cube, bounds);
	cube_size += nchannels * sizeof(struct cube_boundary);
	for (i = 0; i < ncolours; i++) {
		cubes[i] = malloc(cube_size);
		if (!cubes[i])
			goto fail;
	}

	/* allocate and configure colour-quantised image */
	size = sizeof(*ret->image);
	image_size = 0;
	if (width && height) {
		if (width > (SIZE_MAX - offsetof(struct libquanta_image, image)) / size / height)
			goto enomem;
		size *= image_size = width * height;
	}
	size += offsetof(struct libquanta_image, image);
	ret = malloc(offsetof(struct libquanta_image, image) + size);
	if (!ret)
		goto fail;
	ret->width = width;
	ret->height = height;

	/* analyse image */
	compute_histogram(ret, histogram, channels, nchannels);
	if (compute_cumulative_moments(histogram, channels, nchannels, subindex_multiplier))
		goto fail;
	if (partition(cubes, histogram, channels, nchannels, ncolours, &ncolours_used))
		goto fail;

	/* reduce image */
	n = 0;
	for (i = 0; i < ncolours_used; i++) {
		mark(cubes[i], i, tag, channels, nchannels);
		weight = volume_over_occurrences(cubes[i], histogram, channels, nchannels);
		if (!weight)
			continue;
		for (j = 0; j < nchannels; j++) {
			volume_over_bigints(cubes[i], channels[j].histogram, channels, nchannels, &sum);
			mean = bigint_div_small(&sum, weight);
			if (channels[j].ch->bits_out < channels[j].ch->bits_in)
				mean >>= channels[j].ch->bits_in - channels[j].ch->bits_out;
			palette->palette[n * nchannels + j] = (uint64_t)mean;
		}
		n++;
	}
	palette->size = n;
	for (i = 0; i < image_size; i++)
		ret->image[i] = tag[ret->image[i]];

	goto out;

enomem:
	errno = ENOMEM;
	goto fail;

einval:
	errno = EINVAL;
fail:
	free(ret);
	ret = NULL;
out:
	free(tag);
	free(histogram);
	if (channels) {
		for (i = 0; i < nchannels; i++)
			free(channels[i].histogram);
		free(channels);
	}
	if (cubes) {
		for (i = 0; i < ncolours; i++)
			free(cubes[i]);
		free(cubes);
	}
	return ret;
}