/*
 * (mmap-)ent: computes the binary entropy of a file using the Markov Model
 * (<http://en.wikipedia.org/wiki/Information_entropy#Data_as_a_Markov_process>).
 *
 * Loic Tortay, 2010-2012.
 *
 */

#include <sys/mman.h>
#include <sys/stat.h>

#include <errno.h>
#include <fcntl.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "errwarn.h"

/* Program name; usage() prints it in the command synopsis. */
const char progname[] = "mmap-ent";

/* Writes the help text to the given stream and exits (defined below). */
void usage(FILE *);

/*
 * Write the command-line help to `fp` and terminate the process with exit
 * status 1.  This function does not return.
 */
void usage(FILE *fp)
{
	/* Text preceding the synopsis line. */
	static const char intro[] =
	    "\nComputes the binary entropy of a file using the Markov model.\n"
	    "\nUsage:\n";
	/* Option descriptions following the synopsis line. */
	static const char details[] =
	    "\nWhere:\n"
	    " -B bsize: set the read buffer size to bsize bytes, default is file size.\n"
	    "    Beware most systems have several constraints on this parameter and its\n"
	    "    relation to memory page size.\n"
	    " -M Use madvise/posix_madvise to give hints to the pagecache after the\n"
	    "    'mmap' (default is to not give hints).\n";

	fputs(intro, fp);
	/* Only the synopsis line needs formatting (program name). */
	fprintf(fp, "%s [-B bsize] [-M] filename\n", progname);
	fputs(details, fp);

	exit(1);
}


int main(int argc, char *argv[])
{
	size_t		 stats[256];
	struct stat	 st;
	double		 H = 0.0, Pi = 0.0, dsize = 0.0;
	unsigned char	*buffer = NULL;
	char		*filename = NULL;
	size_t		 toread = 0, readsize = 0, bytesread = 0, c;
	int	 	 fd = -1, i = -1, n = 0;
	int		 ch = -1, opt_bsize = 0, opt_madvise = 0;

	/* Parse arguments */
	while ((ch = getopt(argc, argv, ":B:M")) != -1) {
		switch (ch) {
		case 'B':
			opt_bsize = atoi(optarg);
			break;
		case 'M':
			opt_madvise = 1;
			break;
		default:
			usage(stderr);
			break;
		}
	}
	if (optind != (argc - 1)) {
		warning(-1, "File name required");
		usage(stderr);
	}
	filename = argv[optind];

	/* Open target file */
	fd = open(filename, O_RDONLY, 0);
	if (fd == -1)
		error(1, errno, "Unable to open '%s'", filename);

	/* Get target file size */
	memset(stats, 0, sizeof(stats));
	if (fstat(fd, &st) == -1)
		error(1, errno, "Unable to stat '%s'", filename);

	/* Make sure target is a file */
	if (!S_ISREG(st.st_mode))
		error(1, -1, "'%s' is not a regular file", filename);

	toread = (size_t) st.st_size;
	dsize = (double) toread;

	/* Set I/O block size */
	if (opt_bsize != 0)
		readsize = (size_t) opt_bsize;
	else
		readsize = toread;

	/*
	 * Count characters occurences (P[i]).
	 */
	do {
		if (toread < readsize)
			readsize = toread;

		buffer = (unsigned char *) mmap(NULL, readsize, PROT_READ,
		    MAP_PRIVATE, fd, (off_t) bytesread);
		if (buffer == MAP_FAILED)
			error(1, errno, "Unable to mmap('%s')", filename);

		/* If requested, give hints to the system page cache */
		if (opt_madvise) {
#ifdef HAS_POSIX_ADVISORY_INFO
			if (posix_madvise(buffer, readsize,
			    POSIX_MADV_SEQUENTIAL) != 0)
#else
			if (madvise((char *) buffer, readsize,
			    MADV_SEQUENTIAL) != 0)
#endif /* HAS_POSIX_ADVISORY_INFO */
				warning(errno, "Unable to madvise('%s')",
				     filename);
		}
		bytesread += readsize;
		toread -= readsize;

		for (c = 0; c < readsize; c++)
			stats[(int) buffer[c]]++;

		if (munmap((char *) buffer, readsize) == -1)
			warning(errno, "Failed to munmap('%s'@%lu)", filename,
			    (unsigned long) bytesread);
	} while (toread > 0);

	if (close(fd) == -1)
		warning(errno, "Problem closing '%s'", filename);

	/*
	 * Now compute the result:
	 *  H(S) = -sum{i = 0 -> N} (P[i] * log2(P[i])
	 */
	for (i = 0; i < 256; i++) {
		if (stats[i] > 0) {
			Pi = ((double) stats[i]) / dsize;
			H += Pi * log2(Pi);
			n++;
		}
	}
	H = -H;
	printf("H('%s') = %.2lf bits (n = %d, avg(H): %.2f bits)\n", filename,
	    dsize * H, n, H);

	return (0);
}

