/*
 * (mmap-off-)ent: computes the binary entropy of a file using the Markov Model
 * (<http://en.wikipedia.org/wiki/Information_entropy#Data_as_a_Markov_process>).
 *
 * Loic Tortay, 2010-2012.
 *
 */

#include <sys/mman.h>
#include <sys/stat.h>

#include <errno.h>
#include <fcntl.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "errwarn.h"

/* Program name substituted into the usage message printed by usage(). */
const char progname[] = "mmap-off-ent";

void usage(FILE *);

/*
 * Writes the command-line help text to the given stream and terminates
 * the process with exit status 1.  This function never returns.
 */
void usage(FILE *fp)
{
	/* Description and synopsis header. */
	fputs(
"\nComputes the binary entropy of a file using the Markov model.\n"
"\nUsage:\n",
	    fp);

	/* Synopsis line, the only part that needs formatting. */
	fprintf(fp,
"%s [-B bsize] filename\n",
	    progname);

	/* Option descriptions. */
	fputs(
"\nWhere:\n"
" -B bsize: set the read buffer size to bsize bytes, default is file size.\n",
	    fp);

	exit(1);
}


/*
 * Computes the per-byte Shannon entropy of a regular file.
 *
 * The file is walked with read-only mmap() windows that never span more
 * than one memory page; byte occurrences are counted in stats[] and the
 * entropy is derived from the resulting frequencies.
 *
 * Command line: [-B bsize] filename
 *   -B bsize  upper bound (in bytes) for each processed chunk; 0 or absent
 *             means "use the file size".
 *
 * Returns 0 on success.  Invalid arguments end in usage(), which exits
 * with status 1.  Fatal conditions are reported through error(1, ...),
 * which is presumably terminating the process (confirm against errwarn.h).
 */
int main(int argc, char *argv[])
{
	size_t		 stats[256];
	struct stat	 st;
	double		 H = 0.0, Pi = 0.0, dsize = 0.0;
	unsigned char	*buffer = NULL;
	char		*filename = NULL, *endp = NULL;
	long int	 pagesize = 0, opt_bsize = 0;
	off_t		 moff;
	size_t		 boff, pagemask, maplen;
	size_t		 toread = 0, readsize = 0, bytesread = 0, c, bsize;
	int		 fd = -1, i = -1, n = 0;
	int		 ch = -1;

	/* Parse arguments */
	while ((ch = getopt(argc, argv, ":B:")) != -1) {
		switch (ch) {
		case 'B':
			/*
			 * strtol() instead of atoi() so that garbage and
			 * negative sizes are rejected rather than silently
			 * turned into 0 or a huge size_t.
			 */
			errno = 0;
			opt_bsize = strtol(optarg, &endp, 10);
			if (errno != 0 || endp == optarg || *endp != '\0' ||
			    opt_bsize < 0) {
				warning(-1, "Invalid buffer size '%s'", optarg);
				usage(stderr);
			}
			break;
		default:
			usage(stderr);
			break;
		}
	}
	if (optind != (argc - 1)) {
		warning(-1, "File name required");
		usage(stderr);
	}
	filename = argv[optind];

	/* Open target file */
	fd = open(filename, O_RDONLY, 0);
	if (fd == -1)
		error(1, errno, "Unable to open '%s'", filename);

	/* Get target file size */
	memset(stats, 0, sizeof(stats));
	if (fstat(fd, &st) == -1)
		error(1, errno, "Unable to stat '%s'", filename);

	/* Make sure target is a file */
	if (!S_ISREG(st.st_mode))
		error(1, -1, "'%s' is not a regular file", filename);

	/* Get default pagesize for this host */
	pagesize = sysconf(_SC_PAGESIZE);
	if (pagesize == -1)
		error(1, errno, "Unable to get memory page size");

	/* Valid as a mask because page sizes are powers of two. */
	pagemask = (size_t) pagesize - 1;

	/* Refuse sizes that do not survive the conversion to size_t. */
	if ((off_t) (size_t) st.st_size != st.st_size)
		error(1, -1, "'%s' is too large", filename);

	/* Set I/O block size */
	toread = (size_t) st.st_size;
	dsize = (double) toread;

	if (opt_bsize != 0)
		readsize = (size_t) opt_bsize;
	else
		readsize = toread;

	/*
	 * Count characters occurrences (P[i]).
	 * A pre-tested loop so that an empty file is never passed to mmap()
	 * (a zero-length mapping is an error).
	 */
	while (toread > 0) {
		if (toread < readsize)
			readsize = toread;

		/*
		 * boff is the offset inside the page, allowing accesses that
		 * are not page aligned; moff is the page-aligned file offset
		 * required by mmap().
		 */
		boff = bytesread & pagemask;
		moff = (off_t) (bytesread - boff);

		/* Never let a chunk cross the end of the current page. */
		if ((boff + readsize) > (size_t) pagesize)
			bsize = (size_t) pagesize - boff;
		else
			bsize = readsize;

		/*
		 * The mapping starts at the page boundary, so it has to cover
		 * the skipped prefix (boff) as well as the chunk itself.
		 */
		maplen = boff + bsize;

		buffer = mmap(NULL, maplen, PROT_READ, MAP_PRIVATE, fd, moff);
		if (buffer == MAP_FAILED)
			error(1, errno, "Unable to mmap('%s')", filename);

		for (c = 0; c < bsize; c++)
			stats[buffer[boff + c]]++;

		bytesread += bsize;
		toread -= bsize;

		if (munmap(buffer, maplen) == -1)
			error(1, errno, "Failed to munmap('%s'@%lu)", filename,
			    (unsigned long) bytesread);
	}

	if (close(fd) == -1)
		warning(errno, "Problem closing '%s'", filename);

	/*
	 * Now compute the result:
	 *  H(S) = -sum{i = 0 -> N} (P[i] * log2(P[i]))
	 * The terms are subtracted directly so that a zero result is +0.0
	 * and is not printed as "-0.00".
	 */
	for (i = 0; i < 256; i++) {
		if (stats[i] > 0) {
			Pi = ((double) stats[i]) / dsize;
			H -= Pi * log2(Pi);
			n++;
		}
	}
	printf("H('%s') = %.2f bits (n = %d, avg(H): %.2f bits)\n", filename,
	    dsize * H, n, H);

	return (0);
}

