/*
 * (block-)ent: computes the binary entropy of a file using the Markov Model
 * (<http://en.wikipedia.org/wiki/Information_entropy#Data_as_a_Markov_process>).
 *
 * Loic Tortay, 2010-2012.
 *
 */

#include <sys/stat.h>

#include <errno.h>
#include <fcntl.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "errwarn.h"

/*
 * Program name, printed in the usage message.
 * NOTE(review): this object has external linkage; it may also be referenced
 * by the helpers declared in "errwarn.h" -- confirm before making it static.
 */
const char	progname[] = "block-ent";
/* Read block size, in bytes, used unless -B, -P or -S selects another one. */
const size_t	default_bsize = 1048576U;

/* Prints the help text to the given stream, then exits with status 1. */
void usage(FILE *);

/*
 * Writes the command-line help to `fp`, one section at a time, and then
 * terminates the process with exit status 1 (this function never returns).
 */
void usage(FILE *fp)
{
	const unsigned long	shown_bsize = (unsigned long) default_bsize;

	/* Purpose and synopsis. */
	fputs("\nComputes the binary entropy of a file using the Markov model.\n"
	    "\nUsage:\n", fp);
	fprintf(fp, "%s [-B bsize] [-F] [-S] [-P] filename\n", progname);

	/* Per-option descriptions. */
	fputs("\nWhere:\n"
	    " -B bsize: set the read block size to bsize bytes, default is minimum between:\n",
	    fp);
	fprintf(fp, "    %lu bytes & file size.\n", shown_bsize);
	fputs(" -F Use posix_fadvise to give hints to the pagecache before reading the file.\n",
	    fp);
#ifndef HAS_POSIX_ADVISORY_INFO
	/* Only shown on builds compiled without HAS_POSIX_ADVISORY_INFO. */
	fputs("    This option is a NOOP on this system.\n", fp);
#endif /* !HAS_POSIX_ADVISORY_INFO */
	fputs(" -P Use \"preferred block size\" (in 'struct stat') to read the file.\n"
	    " -S Use \"slurp\" mode: try to read the whole file in a single read.\n",
	    fp);

	exit(1);
}


int main(int argc, char *argv[])
{
	size_t		 stats[256];
	struct stat	 st;
	double		 H = 0.0, Pi = 0.0, dsize = 0.0;
	unsigned char	*buffer = NULL;
	char		*filename = NULL;
	size_t		 toread = 0, bytesread = 0, c, bsize;
	ssize_t		 nr = 0;
	int	 	 fd = -1, i = -1, n = 0;
	int		 ch = -1, opt_bsize = 0;
	int		 opt_fadvise = 0, opt_preferred_bsize = 0;
	int		 opt_slurp_mode = 0;

	/* Parse arguments */
	while ((ch = getopt(argc, argv, ":B:FPS")) != -1) {
		switch (ch) {
		case 'B':
			opt_bsize = atoi(optarg);
			break;
		case 'F':
			opt_fadvise = 1;
			break;
		case 'P':
			opt_preferred_bsize = 1;
			break;
		case 'S':
			opt_slurp_mode = 1;
			break;
		default:
			usage(stderr);
			break;
		}
	}
	if (optind != (argc - 1)) {
		warning(-1, "File name required");
		usage(stderr);
	}
	filename = argv[optind];

	/* Open target file */
	fd = open(filename, O_RDONLY, 0);
	if (fd == -1)
		error(1, errno, "Unable to open '%s'", filename);

	/* Get target file size */
	memset(stats, 0, sizeof(stats));
	if (fstat(fd, &st) == -1)
		error(1, errno, "Unable to stat '%s'", filename);

	/* Make sure target is a file */
	if (!S_ISREG(st.st_mode))
		error(1, -1, "'%s' is not a regular file", filename);

	toread = (size_t) st.st_size;
	dsize = (double) toread;

	if (!opt_slurp_mode) { /* Non whole file reads */
		/* Set I/O block size */
		if (opt_bsize != 0)
			bsize = (size_t) opt_bsize;
		else
			bsize = default_bsize;

		/* Use system reported preferred block size for this file ? */
		if (opt_preferred_bsize) {
			bsize = (size_t) st.st_blksize;
			if (opt_bsize)
				warning(-1, "Both -B & -P used, block size "
				    "will be: %lu\n", (unsigned long) bsize);
		}
	} else /* Whole file read */
		bsize = toread;

	/* Prepare read buffer */
	buffer = malloc(bsize);
	if (buffer == NULL)
		error(1, errno, "Unable to allocate memory for buffer");

#ifdef HAS_POSIX_ADVISORY_INFO
	/* If supported and requested, give hints to the system cache */
	if (opt_fadvise) {
		if (posix_fadvise(fd, 0, st.st_size, POSIX_FADV_SEQUENTIAL) != 0)
			warning(errno, "Unable to fadvise('%s')", filename);
	}
#endif /* HAS_POSIX_ADVISORY_INFO */

	/*
	 * Count characters occurences (P[i]).
	 */
	while (toread > 0) {
		if (toread < bsize)
			bsize = toread;

		nr = read(fd, buffer, bsize);
		if (nr == -1)
			error(1, errno, "Error while reading '%s' at offset %lu",
			    filename, (unsigned long) bytesread);
		else {
			bytesread += (size_t) nr;
			toread -= (size_t) nr;
		}

		for (c = 0; c < (size_t) nr; c++)
			stats[(int) buffer[c]]++;
	}

	free(buffer);
	if (close(fd) == -1)
		warning(errno, "Problem closing '%s'", filename);

	/*
	 * Now compute the result:
	 *  H(S) = -sum{i = 0 -> N} (P[i] * log2(P[i])
	 */
	for (i = 0; i < 256; i++) {
		if (stats[i] > 0) {
			Pi = ((double) stats[i]) / dsize;
			H += Pi * log2(Pi);
			n++;
		}
	}
	H = -H;
	printf("H('%s') = %.2lf bits (n = %d, avg(H): %.2f bits)\n", filename,
	    dsize * H, n, H);

	return (0);
}

