/*
 * (cstream-)ent: computes the binary entropy of a file using the Markov Model
 * (<http://en.wikipedia.org/wiki/Information_entropy#Data_as_a_Markov_process>).
 *
 * Loic Tortay, 2012.
 *
 */

#include <sys/stat.h>

#include <errno.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "errwarn.h"

/*
 * Program name, printed in the usage message.
 * NOTE(review): presumably also consumed by the errwarn.h helpers -- confirm.
 */
const char progname[] = "cstream-ent";
/* Read block size in bytes (1 MiB) used when -B is not given or is 0. */
const size_t default_bsize = 1048576U;

/* Prototype for the help printer defined right below. */
void usage(FILE *);

/*
 * Print the command-line help on fp, then terminate the program with exit
 * status 1.  This function never returns to its caller.
 *
 * The text shows the program name, the default read block size and the
 * C library's default stream buffer size (BUFSIZ).
 */
void usage(FILE *fp)
{
	fprintf(fp,
"\nComputes the binary entropy of a file using the Markov model.\n"
"\nUsage:\n"
"%s [-B bsize] [-S] [-Z bufsize] filename\n"
"\nWhere:\n"
" -B bsize: set the read block size to bsize bytes, default is %lu bytes.\n"
" -Z bufsize: set the stream buffer size to bufsize bytes, default is\n"
"    to let the system choose its own size (which is: %d).\n"
" -S Use \"slurp\" mode: try to read the whole file in a single read.\n",
	    progname, (unsigned long) default_bsize, BUFSIZ);

	exit(1);
}


int main(int argc, char *argv[])
{
	size_t		 stats[256];
	struct stat	 st;
	double		 H = 0.0, Pi = 0.0, dsize = 0.0;
	unsigned char	*buffer = NULL;
	char		*stbuff = NULL;
	char		*filename = NULL;
	FILE		*readfp;
	size_t		 toread = 0, bytesread = 0, bsize = 0, c, ne;
	int	 	 i = -1, n = 0, ch = 0;
	int		 opt_bsize = 0, opt_slurp_mode = 0, opt_buffsize = 0;

	/* Parse arguments */
	while ((ch = getopt(argc, argv, ":B:SZ:")) != -1) {
		switch (ch) {
		case 'B':
			opt_bsize = atoi(optarg);
			break;
		case 'S':
			opt_slurp_mode = 1;
			break;
		case 'Z':
			opt_buffsize = atoi(optarg);
			break;
		default:
			usage(stderr);
			break;
		}
	}
	if (optind != (argc - 1)) {
		warning(-1, "File name required");
		usage(stderr);
	}
	filename = argv[optind];

	/* Open target file */
	readfp = fopen(filename, "rb");
	if (readfp == NULL)
		error(1, errno, "Unable to open '%s'", filename);

	/* Get target file size */
	memset(stats, 0, sizeof(stats));
	if (fstat(fileno(readfp), &st) == -1)
		error(1, errno, "Unable to stat '%s'", filename);

	/* Make sure target is a file */
	if (!S_ISREG(st.st_mode))
		error(1, -1, "'%s' is not a regular file", filename);

	toread = (size_t) st.st_size;
	dsize = (double) toread;

	if (!opt_slurp_mode) { /* Non whole file reads */
		/* Set I/O "block" size */
		if (opt_bsize != 0)
			bsize = (size_t) opt_bsize;
		else
			bsize = default_bsize;
	} else /* Whole file read */
		bsize = toread;

	/* Prepare read buffer */
	buffer = malloc(bsize);
	if (buffer == NULL)
		error(1, errno, "Unable to allocate memory for buffer");

	/* If needed, prepare stream buffer */
	if (opt_buffsize) {
		stbuff = malloc(opt_buffsize);
		if (stbuff == NULL)
			error(1, errno, "Unable to allocate memory for stream "
			    "buffer");

		if (setvbuf(readfp, stbuff, _IOFBF, opt_buffsize) != 0)
			error(1, errno, "Unable to set buffer for stream");
	}

	/*
	 * Count characters occurences (P[i]).
	 */
	while (toread > 0) {
		if (toread < bsize)
			bsize = toread;

		ne = fread(buffer, 1, bsize, readfp);
		if (ne != bsize) {
			if (ferror(readfp)) {
				warning(errno, "Error while reading '%s'",
				    filename);
				break;
			}
			if (feof(readfp))
				break;
		}
		bytesread += ne;
		toread -= ne;

		for (c = 0; c < ne; c++)
			stats[(int) buffer[c]]++;
	}

	free(buffer);
	if (fclose(readfp) == -1)
		warning(errno, "Problem closing '%s'", filename);

	/*
	 * Now compute the result:
	 *  H(S) = -sum{i = 0 -> N} (P[i] * log2(P[i])
	 */
	for (i = 0; i < 256; i++) {
		if (stats[i] > 0) {
			Pi = ((double) stats[i]) / dsize;
			H += Pi * log2(Pi);
			n++;
		}
	}
	H = -H;
	printf("H('%s') = %.2lf bits (n = %d, avg(H): %.2f bits)\n", filename,
	    dsize * H, n, H);

	return (0);
}

