#ifdef CORRELATE

/* This code exists purely for research purposes.  When the -z option is
 * enabled this code is invoked with the intent of comparing the behaviour
 * of ACD to some other clone detection tool.  It is of no general use
 * since the inputs are hard coded into it.  Only by changing this code
 * on a needs basis might it serve any useful purpose to the general
 * user.
 */

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#ifdef WIN32
#define snprintf _snprintf
#endif

#include "object.h"
#include "xmalloc.h"
#include "util.h"
#include "file.h"
#include "location.h"
#include "code.h"
#include "function.h"
#include "source.h"
#include "data.h"

/* Values used with source_dataT.type and as indexes into
 * source_dataT.headPP.  ACD_TYPE and CLIC_TYPE are or'ed together in
 * the type field; OUTSIDE_BUILD marks a file that Cfile::exists() did
 * not find.  Index 0 of headPP receives every segment regardless of
 * which tool reported it.
 */
#define SOURCE_TYPE 	0
#define ACD_TYPE    	1
#define CLIC_TYPE		2
#define OUTSIDE_BUILD	4

/* One inclusive range of line numbers within a source file, kept on a
 * singly linked list.  apply_segment1() merges ranges that overlap or
 * touch, so the entries of one list stay disjoint.
 */
typedef struct segmentS {
	int					start;	// First line number of the range
	int					end;	// Lineno
	struct segmentS		*nextP;	// Next range on the same list, or null
} segmentT;

/* Per source file record.  The source_data table of these records is
 * supplied by the included data header and ends with an entry whose
 * nameP is null.
 */
typedef struct {
	int			id;			// Index of this entry within source_data
	int			lines;		// Line count; upper bound for clone line numbers
	char		*nameP;		// File name; null marks the end of the table
	int			type;		// ACD_TYPE | CLIC_TYPE bits, or OUTSIDE_BUILD
	segmentT	*headPP[3];	// Covered ranges: [0] either tool, [ACD_TYPE], [CLIC_TYPE]

} source_dataT;

/* Position of one end of a clone as supplied by the data header.  Only
 * lineno is read in this file; init_data() increments it once so that
 * it agrees with ACD's 1-based numbering.
 */
typedef struct {
	int	lineno;
	int unknown1;	// Not referenced in this file
	int unknown2;	// Not referenced in this file
} line_dataT;

/* One clone pair from the external tool (the clone_data table comes
 * from the included data header).  The file fields are indexes into
 * source_data.  init_data() reorders each pair so that clone1 is the
 * one with the lower file index, or the lower start line when both
 * halves are in the same file.
 */
typedef struct {
	int			clone1_file;
	line_dataT	start_clone1;
	line_dataT	end_clone1;
	int			clone2_file;
	line_dataT	start_clone2;
	line_dataT	end_clone2;
	int			weight;		// Not referenced in this file
	int			overlap;	// Incremented by correlate_data() for each ACD pair overlapping this one
} clone_dataT;

//#include "git-data30.h"
#include "git-data60.h"
//#include "git-rcf30.h"
//#include "git-rcf60.h"
//#include "git-data69.h"

/* Number of source_data entries in use: init_data() raises it to the
 * highest id seen plus one, and lookup_source() increments it if it
 * ever registers a new entry.
 */
int sources;

static int
lookup_source(char *nameP)
{
	source_dataT	*source_dataP;
	char			*name1P;

	for (source_dataP = source_data; ; ++source_dataP) {
		if (!(name1P = source_dataP->nameP)) {
			assert(0);
			source_dataP->id    = source_dataP - source_data;
			source_dataP->nameP = nameP;
			fprintf(stderr, "%s\n", nameP);
			//source_dataP->lines = fileP->m_source_lines;
			assert(source_dataP-source_data == sources);
			++sources;
			assert(sources <= (int) (sizeof(source_data)/sizeof(source_data[0])));
			break;
		}
		if (!strcmp(nameP, name1P)) {
			break;
	}	}
	source_dataP->type |= ACD_TYPE;
	return(source_dataP - source_data);
}

// Output streams opened by init_data() and closed by report_correlated()
static FILE *dataF;		// correlated.html: the statistics table
static FILE *missedF;	// missed.html: files outside the build and clone pairs reported as missed

// Settings defined elsewhere in the program
extern	char	*g_html_dirP;				// Directory in which both reports are created
extern int		g_report;					// When non-zero, per-pair line listings are written
extern int		g_min_match_sequence;		// Only echoed in the report header
extern int		g_min_function_sequence;	// Only echoed in the report header

// Totals gathered by init_data() over the files that Cfile::exists() finds
static int		 g_outside_build = 0;	// Number of files that were NOT found
static int		 max_code_lth  = 0;		// Largest line count
static long long sum_code_lth  = 0;		// Sum of line counts
static long long sum_code2_lth = 0;		// Sum of squared line counts

void
init_data(void)
{
	source_dataT	*source_dataP;
	char			*nameP;
	clone_dataT		*clone_dataP, *endP;
	clone_dataT		temp;
	long long		lth1, ret;

	ret = snprintf(g_filename, FILENAME_MAX, "%s/correlated.html", g_html_dirP);
	if (ret < 0 || FILENAME_MAX <= ret) {
		fprintf(stderr, "correlated filename too long\n");
		error_exit(1);
	}
	dataF = open_file(g_filename);
	ret = snprintf(g_filename, FILENAME_MAX, "%s/missed.html", g_html_dirP);
	if (ret < 0 || FILENAME_MAX <= ret) {
		fprintf(stderr, "Missed filename too long\n");
		error_exit(1);
	}
	missedF = open_file(g_filename);
	fprintf(dataF, HTML_HEADER "<title>Correlations</title>\n<body>\n<h3>Correlations</h3>\n<p>\nMin lengths=%d/%d\n<p>\n<a href=missed.html>missed</a><p>\n", g_min_match_sequence, g_min_function_sequence);
	fprintf(missedF, HTML_HEADER "<title>ACD missed these clones</title>\n<body>\n<h3>ACD missed these clones</h3>\n<p>\n");

	for (source_dataP = source_data; (nameP = source_dataP->nameP); ++source_dataP) {
		assert((source_dataP - source_data) == source_dataP->id);
		if (sources <= source_dataP->id) {
			sources = source_dataP->id + 1;
		}
		if (!Cfile::exists(nameP)) {
			if (!g_outside_build) {
				fprintf(missedF, "<h3>Outside build</h3>\n<p>\n");
			}
			++g_outside_build;
			fprintf(missedF, "%s\n<BR>\n", nameP);
			source_dataP->type = OUTSIDE_BUILD;
		} else {
			lth1           = source_dataP->lines;
			if (lth1 > max_code_lth) {
				max_code_lth = lth1;
			}
			sum_code_lth  += lth1;
			sum_code2_lth += (lth1 * lth1);
	}	}

	
	clone_dataP = clone_data;
	endP        = clone_data + (sizeof(clone_data)/sizeof(clone_data[0]));
	for (; clone_dataP < endP; ++clone_dataP) {
		// Change from offset 0 to offset 1 to agree with ACD
		clone_dataP->start_clone1.lineno++;
		clone_dataP->end_clone1.lineno++;
		clone_dataP->start_clone2.lineno++;
		clone_dataP->end_clone2.lineno++;
		if (clone_dataP->clone1_file > clone_dataP->clone2_file ||
            (clone_dataP->clone1_file == clone_dataP->clone2_file && clone_dataP->start_clone1.lineno > clone_dataP->start_clone2.lineno)) {
			temp = *clone_dataP;
			clone_dataP->clone1_file  = temp.clone2_file;
			clone_dataP->start_clone1 = temp.start_clone2;
			clone_dataP->end_clone1   = temp.end_clone2;
			clone_dataP->clone2_file  = temp.clone1_file;
			clone_dataP->start_clone2 = temp.start_clone1;
			clone_dataP->end_clone2   = temp.end_clone1;
		}
	}
}

static void
apply_segment1(int source, int start, int end, int type)
{
	source_dataT	*sourceP;
	segmentT		**headPP;
	segmentT		*segmentP, *segment1P;

	sourceP = source_data + source;
	headPP  = sourceP->headPP + type;
	assert (sourceP->lines >= end);
	for (segmentP = *headPP; segmentP; segmentP = segmentP->nextP) {
		if (end   < segmentP->start - 1 ||
			start > segmentP->end + 1) {
			continue;
		}
		if (start < segmentP->start) {
			segmentP->start = start;
		} else {
			start           = segmentP->start;
		}
		if (end > segmentP->end) {
			segmentP->end = end;
		} else {
			end           = segmentP->end;
		}

		while (segment1P = *headPP) {
			if (segment1P == segmentP) {
				goto advance;
			}
			if (end   < segment1P->start - 1 ||
				start > segment1P->end + 1) {
				goto advance;
			}
			if (start > segment1P->start) {
				segmentP->start = start = segment1P->start;
			}
			if (end < segment1P->end) {
				segmentP->end = end = segment1P->end;
			}
			*headPP = segment1P->nextP;
			Xfree(segment1P);
			continue;
advance:	headPP = &(segment1P->nextP);
		}
		return;
	}
	segmentP = (segmentT *) Xmalloc(sizeof(segmentT));
	segmentP->start = start;
	segmentP->end   = end;
	segmentP->nextP = *headPP;
	*headPP         = segmentP;
}

static void
apply_segment(int source, int start, int end, int type)
{
	assert(type);
	apply_segment1(source, start, end, type);
	apply_segment1(source, start, end, 0);
}

// Counters maintained by correlate_data()
// ACD pairs processed (those with non-negative line numbers)
static long long g_acd_pairs   = 0;
// ACD pairs that overlapped at least one clone_data entry
static long long g_acd_overlap = 0;
// Sum over overlapping pairs of (shared lines in half 1) * (shared lines in half 2)
static long long g_overlap_area = 0;
// Accumulates the product of the two line ranges of the clone_data entry
static long long g_acd_in_area  = 0;
// Accumulates the product of the two line ranges of the ACD pair
static long long g_clic_in_area = 0;

// Counters maintained by correlate_clone_data()
// clone_data entries whose two files are both inside the build
static long long g_clic_pairs  = 0;
// Of those, entries that some ACD pair overlapped
static long long g_clic_overlap= 0;
static long long g_clic_bad    = 0;			// Same-file entries whose two ranges intersect
// Number of in-build files per type value (0, ACD, CLIC, ACD|CLIC)
static long long g_source[4]   = {0};
// Length statistics over both halves of every ACD pair
static long long sum_acd_lth   = 0;
static long long sum_acd2_lth  = 0;
static int		 max_acd_lth   = 0;
static long long cnt_acd_lth   = 0;
// Length statistics over both halves of every accepted clone_data entry
static long long sum_clic_lth  = 0;
static long long sum_clic2_lth = 0;
static long long cnt_clic_lth  = 0;
static int       max_clic_lth  = 0;

// Stream receiving the per-pair listings; defined elsewhere
extern FILE *cloneF;

/* Correlate one clone pair found by ACD against the hard coded
 * clone_data table.
 *
 * function1P/start1P/end1P describe the first half of the ACD pair and
 * function2P/start2P/end2P the second.  Each half is converted to a
 * file index (lookup_source) and an inclusive line range
 * (compute_lineno); the pair is dropped when any line number comes back
 * negative.  The halves are ordered the same way init_data() orders
 * clone_data: by file index, then by start line within one file.
 *
 * For every clone_data entry on the same two files whose ranges
 * intersect the ACD ranges on both sides, the entry's overlap count is
 * bumped, the area accumulators are updated and, when g_report is set,
 * the entry is listed side by side on cloneF.  Finally the ACD length
 * statistics are updated and both ranges are recorded as ACD coverage.
 */
void
correlate_data(Cfunction *function1P, codeT *start1P, codeT *end1P, Cfunction *function2P, codeT *start2P, codeT *end2P)
{
	Cfunction	*functionP;
	codeT		*codeP;
	int			i, j, temp;
	int			start1, start2, end1, end2;
	clone_dataT	*clone_dataP, *endP;
	int			lth, seen;
	Cfile		*file1P, *file2P;
	const Cfile *default1P, *default2P;
	int		 	lineno1, lineno2, end_lineno1, end_lineno2;
	int			shared_start1, shared_end1, shared_start2, shared_end2;

	i = lookup_source(function1P->m_sourceP->m_fileP->m_nameP);
	j = lookup_source(function2P->m_sourceP->m_fileP->m_nameP);

	if (i > j) {
		// Put the half in the lower numbered file first
		functionP  = function1P;
		function1P = function2P;
		function2P = functionP;
		codeP      = start1P;
		start1P    = start2P;
		start2P    = codeP;
		codeP      = end1P;
		end1P      = end2P;
		end2P      = codeP;
		temp       = i;
		i          = j;
		j          = temp;
	}

	file1P = function1P->m_sourceP->m_fileP;
	file2P = function2P->m_sourceP->m_fileP;

	function1P->compute_lineno(start1P, end1P, &start1, &end1);
	function2P->compute_lineno(start2P, end2P, &start2, &end2);

	if (start1 < 0 || start2 < 0 || end1 < 0 || end2 < 0) {
		return;
	}

	if (i == j && start1 > start2) {
		// Same file: the earlier range goes first
		temp   = start1;
		start1 = start2;
		start2 = temp;
		temp   = end1;
		end1   = end2;
		end2   = temp;
	}

	// Data now in same ordering as clone data output

	++g_acd_pairs;

	seen        = 0;
	clone_dataP = clone_data;
	endP        = clone_data + (sizeof(clone_data)/sizeof(clone_data[0]));
	for (; clone_dataP < endP; ++clone_dataP) {
		assert(clone_dataP->clone2_file >= clone_dataP->clone1_file);
		if (i != clone_dataP->clone1_file || j != clone_dataP->clone2_file) {
			continue;
		}
		lineno1     = clone_dataP->start_clone1.lineno;
		lineno2     = clone_dataP->start_clone2.lineno;
		end_lineno1 = clone_dataP->end_clone1.lineno;
		end_lineno2 = clone_dataP->end_clone2.lineno;

		if (end1   < lineno1 ||
			start1 > end_lineno1 ||
			end2   < lineno2 ||
			start2 > end_lineno2) {
			// No overlap
			continue;
		}
		++(clone_dataP->overlap);
		seen = 1;

		// Intersection of the ACD range and the table range on each side
		shared_start1 = (start1 < lineno1)     ? lineno1 : start1;
		shared_end1   = (end1   < end_lineno1) ? end1    : end_lineno1;
		shared_start2 = (start2 < lineno2)     ? lineno2 : start2;
		shared_end2   = (end2   < end_lineno2) ? end2    : end_lineno2;

		// Widen before multiplying so the product cannot overflow int.
		// NOTE(review): g_acd_in_area is fed the table entry's ranges and
		// g_clic_in_area the ACD ranges -- confirm the names match the
		// intended meaning; the assignment is kept as it was.
		g_overlap_area += ((long long) (shared_end2 - shared_start2 + 1)) * (shared_end1 - shared_start1 + 1);
		g_acd_in_area  += ((long long) (end_lineno2 - lineno2 + 1)) * (end_lineno1 - lineno1 + 1);
		g_clic_in_area += ((long long) (end2 - start2 + 1)) * (end1 - start1 + 1);

		if (g_report) {
			fprintf(cloneF,
				"<h3>Correlates with CLIC clone pairs</h3>\n<p>\n"
				"<table><tr><th>%s</th><th>%s</th></tr>\n",
					file1P->m_nameP, file2P->m_nameP);

			default1P = file1P;
			default2P = file2P;
			// lineno1/lineno2 serve as cursors from here on; they are
			// reloaded from the table at the top of the next iteration
			while (lineno1 <= end_lineno1 || lineno2 <= end_lineno2) {
				fputs("<tr><td>", cloneF);
				if (lineno1 <= end_lineno1) {
					file1P->showline(cloneF, &default1P, lineno1);
					++lineno1;
				}
				fputs("</td><td>", cloneF);
				if (lineno2 <= end_lineno2) {
					file2P->showline(cloneF, &default2P, lineno2);
					++lineno2;
				}
				fputs("</td></tr>\n", cloneF);
			}
			fputs("</table>\n", cloneF);
		}
	}
	if (seen) {
		++g_acd_overlap;
	}

	assert (source_data[i].lines >= end1);
	assert (source_data[j].lines >= end2);

	lth = end1 - start1 + 1;
	assert(lth > 0);
	if (lth > max_acd_lth) {
		max_acd_lth = lth;
	}
	sum_acd_lth  += lth;
	sum_acd2_lth += ((long long) lth) * lth;

	lth = end2 - start2 + 1;
	assert(lth > 0);
	if (lth > max_acd_lth) {
		max_acd_lth = lth;
	}
	sum_acd_lth  += lth;
	sum_acd2_lth += ((long long) lth) * lth;

	cnt_acd_lth  += 2;

	apply_segment(i, start1, end1, ACD_TYPE);
	apply_segment(j, start2, end2, ACD_TYPE);
}

/* Second pass, run once after every ACD pair has been fed through
 * correlate_data(): walk the clone_data table and classify each entry.
 *
 * Entries that touch a file marked OUTSIDE_BUILD are ignored.  Of the
 * rest:
 *   - a same-file entry whose two ranges intersect is counted in
 *     g_clic_bad, reported as "CLIC overlapped pair" and otherwise
 *     skipped;
 *   - an entry with a non-zero overlap count is counted in
 *     g_clic_overlap;
 *   - anything else is reported as "ACD missed clone pair".
 * Reports go to missedF and are only written when g_report is set.
 * Every entry that is not skipped contributes to the CLIC length
 * statistics, flags both files with CLIC_TYPE and records both ranges
 * as CLIC coverage.
 */
static void
correlate_clone_data(void)
{
	clone_dataT		*clone_dataP, *endP;
	int	 			start1, start2, end1, end2;
	int				i, j, lth;
	char			*name1P, *name2P;
	Cfile			*file1P, *file2P;
	const Cfile		*default1P, *default2P;
	int				lineno1, lineno2;
	const char		*P;		// Report heading; null when nothing is to be reported
	int				skip;

	clone_dataP = clone_data;
	endP        = clone_data + (sizeof(clone_data)/sizeof(clone_data[0]));
	for (; clone_dataP < endP; ++clone_dataP) {
		i     = clone_dataP->clone1_file;
		j     = clone_dataP->clone2_file;

		assert(i >= 0 && i < sources);
		assert(j >= 0 && j < sources);
		assert (i <= j);

		if (source_data[i].type & OUTSIDE_BUILD) {
			continue;
		}
		if (source_data[j].type & OUTSIDE_BUILD) {
			continue;
		}
		start1 = clone_dataP->start_clone1.lineno;
		end1   = clone_dataP->end_clone1.lineno;
		start2 = clone_dataP->start_clone2.lineno;
		end2   = clone_dataP->end_clone2.lineno;

		assert(start1 <= end1);
		assert(start2 <= end2);
		assert(start1 > 0 && end1 > 0 && start2 > 0 && end2 > 0);

		++g_clic_pairs;

		assert (source_data[i].lines >= end1);
		assert (source_data[j].lines >= end2);

		name1P   = source_data[i].nameP;
		name2P   = source_data[j].nameP;
		file1P   = Cfile::locate(name1P);
		assert(file1P);
		skip = 0;
		P    = "ACD missed clone pair";
		if (i != j) {
			file2P   = Cfile::locate(name2P);
			assert(file2P);
			if (clone_dataP->overlap) {
				++g_clic_overlap;
				P = 0;
			}
		} else {
			file2P = file1P;
			if (start2 <= end1 && start1 <= end2) {
				// Both halves share lines of the same file
				++g_clic_bad;
				P = "CLIC overlapped pair";
				skip = 1;
			} else if (clone_dataP->overlap) {
				++g_clic_overlap;
				P = 0;
			}
		}

		if (P) {
			if (g_report) {
				fprintf(missedF, "<h3>%s</h3>\n", P);
				fprintf(missedF,
					"<p>\n<table><tr><th>%s</th><th>%s</th></tr>\n",
						name1P, name2P);
				default1P = file1P;
				default2P = file2P;
				lineno1   = start1;
				lineno2   = start2;
				while (lineno1 <= end1 || lineno2 <= end2) {
					fputs("<tr><td>", missedF);
					if (lineno1 <= end1) {
						file1P->showline(missedF, &default1P, lineno1);
						++lineno1;
					}
					fputs("</td><td>", missedF);
					if (lineno2 <= end2) {
						file2P->showline(missedF, &default2P, lineno2);
						++lineno2;
					}
					fputs("</td></tr>\n", missedF);
				}
				fputs("</table>\n<p>\n", missedF);
			}
			if (skip) {
				// Self-overlapping entries take no part in the statistics
				continue;
			}
		}

		source_data[i].type |= CLIC_TYPE;
		source_data[j].type |= CLIC_TYPE;

		lth = end1 - start1 + 1;
		if (lth > max_clic_lth) {
			max_clic_lth = lth;
		}
		sum_clic_lth  += lth;
		// Widen before squaring so the product cannot overflow int
		sum_clic2_lth += ((long long) lth) * lth;

		lth = end2 - start2 + 1;
		if (lth > max_clic_lth) {
			max_clic_lth = lth;
		}
		sum_clic_lth  += lth;
		sum_clic2_lth += ((long long) lth) * lth;
		cnt_clic_lth  += 2;

		apply_segment(i, start1, end1, CLIC_TYPE);
		apply_segment(j, start2, end2, CLIC_TYPE);
	}
}

/* Tally the in-build source files by their type value: g_source[t] is
 * incremented for each of the first `sources` entries whose type is t.
 * Entries marked OUTSIDE_BUILD are left out, and are expected to carry
 * no other flag.
 */
static void
compute_statistics(void)
{
	int	index;

	for (index = 0; index < sources; ++index) {
		const int	kind = source_data[index].type;

		if (!(kind & OUTSIDE_BUILD)) {
			g_source[kind] += 1;
			continue;
		}
		assert(kind == OUTSIDE_BUILD);
	}
}

void
report_correlated(void)
{
	int				i;
	double			d, d2;
	char			*P;
	long long		sum;
	source_dataT	*source_dataP;
	segmentT		*segmentP;
	long long		total[3];
	long long		*totalP;
	long long		total_lines;
	double			acd_percent, clics_percent, both_percent, and_percent;
	double			fraction;

	correlate_clone_data();
	compute_statistics();

	total_lines = 0;
	total[0] = 0;
	total[1] = 0;
	total[2] = 0;
	for (source_dataP = source_data; source_dataP->nameP; ++source_dataP) {
        if (source_dataP->type & OUTSIDE_BUILD) {
            continue;
        }
		total_lines += source_dataP->lines;
		for (i = 0; i < 3; ++i) {
			totalP = total + i;
			for (segmentP = source_dataP->headPP[i]; segmentP; segmentP = segmentP->nextP) {
				*totalP += segmentP->end - segmentP->start + 1;
	}	}	}

	fprintf(dataF, "<h3>Statistics</h3>\n<p>\n<table>\n");
	fprintf(dataF, "<tr><th align=right>Source files</th><td>%d</td></tr>\n", (int) sources);
	fprintf(dataF, "<tr><th align=right>Outside build</th><td>%d</td></tr>\n", g_outside_build);
	fprintf(dataF, "<tr><th align=right>Source Lines</th><td>%lld</td></tr>\n", total_lines);
	fprintf(dataF, "<tr><th align=right>Max code lth</th><td>%d</td></tr>\n", max_code_lth);
	d  = ((double) sum_code_lth) / ((double) sources);
	fprintf(dataF, "<tr><th align=right>Avg code lth</th><td>%lg</td></tr>\n", d);
	d2 = ((double) sum_code2_lth) / ((double) sources);
	d  = sqrt(d2 - d*d);
	fprintf(dataF, "<tr><th align=right>Std code lth</th><td>%lg</td></tr>\n", d);
	fprintf(dataF, "<tr><th align=right>CLIC pairs</th><td>%lld</td></tr>\n", g_clic_pairs);
	fprintf(dataF, "<tr><th align=right>CLIC bad</th><td>%lld</td></tr>\n", g_clic_bad);
	fprintf(dataF, "<tr><th align=right>CLIC ok</th><td>%lld</td></tr>\n", g_clic_pairs - g_clic_bad);
	fprintf(dataF, "<tr><th align=right>Total CLIC lth</th><td>%lld</td></tr>\n", sum_clic_lth);
	fprintf(dataF, "<tr><th align=right>Max CLIC lth</th><td>%d</td></tr>\n", max_clic_lth);
	d  = ((double) sum_clic_lth) / ((double) cnt_clic_lth);
	fprintf(dataF, "<tr><th align=right>Avg CLIC lth</th><td>%lg</td></tr>\n", d);
	d2 = ((double) sum_clic2_lth) / ((double) cnt_clic_lth);
	d  = sqrt(d2 - d*d);
	fprintf(dataF, "<tr><th align=right>Std CLIC lth</th><td>%lg</td></tr>\n", d);


	fprintf(dataF, "<tr><th></th><td></td></tr>\n");
	fprintf(dataF, "<tr><th align=right>ACD pairs</th><td>%lld</td></tr>\n", g_acd_pairs);
	fprintf(dataF, "<tr><th align=right>ACD overlap</th><td>%lld</td></tr>\n", g_acd_overlap);
	fprintf(dataF, "<tr><th align=right>CLIC overlap</th><td>%lld</td></tr>\n", g_clic_overlap);
	fprintf(dataF, "<tr><th align=right>ACD%% overlap</th><td>%lg</td></tr>\n",
		((double) (g_acd_overlap  * 100)) / ((double) g_acd_pairs));
	fprintf(dataF, "<tr><th align=right>CLIC%% overlap</th><td>%lg</td></tr>\n",
		((double) (g_clic_overlap * 100)) / ((double) (g_clic_pairs - g_clic_bad)));
	fraction = ((double) g_overlap_area) / ((double) g_acd_in_area);
	fprintf(dataF, "<tr><th align=right>ACD area%% overlap</th><td>%lg</td></tr>\n", fraction * 100);
	fprintf(dataF, "<tr><th align=right>sqrt(ACD area%%)</th><td>%lg</td></tr>\n", sqrt(fraction) * 100);
	
	fraction = ((double) g_overlap_area) / ((double) g_clic_in_area);
	fprintf(dataF, "<tr><th align=right>CLIC area%% overlap</th><td>%lg</td></tr>\n", fraction * 100);
	fprintf(dataF, "<tr><th align=right>sqrt(CLIC area%%)</th><td>%lg</td></tr>\n", sqrt(fraction) * 100);
	
	sum = 0;
	for (i = 0; i < 4; ++i) {
		switch(i) {
		case 0:
			P = "Neither";
			break;
		case ACD_TYPE:
			P = "ACD";
			break;
		case CLIC_TYPE:
			P = "CLIC";
			break;
		case ACD_TYPE | CLIC_TYPE:
			P = "Both";
			break;
		default:
			assert(0);
			P = 0;
		}
		fprintf(dataF, "<tr><th align=right>%s hit source</th><td>%lld</td></tr>\n", P, g_source[i]);
		if (i) {
			sum += g_source[i];
		}
	}
	fprintf(dataF, "<tr><th align=right>total hit source</th><td>%lld</td></tr>\n", sum);

	fprintf(dataF, "<tr><th align=right>Total ACD lth</th><td>%lld</td></tr>\n", sum_acd_lth);
	fprintf(dataF, "<tr><th align=right>Max ACD lth</th><td>%d</td></tr>\n", max_acd_lth);
	d  = ((double) sum_acd_lth) / ((double) cnt_acd_lth);
	fprintf(dataF, "<tr><th align=right>Avg ACD lth</th><td>%lg</td></tr>\n", d);
	d2 = ((double) sum_acd2_lth) / ((double) cnt_acd_lth);
	d  = sqrt(d2 - d*d);
	fprintf(dataF, "<tr><th align=right>Std ACD lth</th><td>%lg</td></tr>\n", d);

	acd_percent   = ((double) (total[ACD_TYPE] * 100)) / ((double) total_lines);
	clics_percent = ((double) (total[CLIC_TYPE]* 100)) / ((double) total_lines);
	both_percent  = ((double) (total[0]        * 100)) / ((double) total_lines);
	and_percent   = acd_percent + clics_percent - both_percent;

	fprintf(dataF, "<tr><th align=right>ACD %%</th><td>%lg%%</td></tr>\n",
		acd_percent);
	fprintf(dataF, "<tr><th align=right>CLICS %%</th><td>%lg%%</td></tr>\n",
		clics_percent);
	fprintf(dataF, "<tr><th align=right>ACD or CLICS %%</th><td>%lg%%</td></tr>\n", 
		both_percent);
	fprintf(dataF, "<tr><th align=right>ACD and CLICS %%</th><td>%lg%%</td></tr>\n", 
		and_percent);
	fprintf(dataF, "<tr><th align=right>ACD | CLICS %%</th><td>%lg%%</td></tr>\n", 
		and_percent * 100.0 / clics_percent);

	fprintf(dataF, "<tr><th align=right>CLICS | ACD %%</th><td>%lg%%</td></tr>\n", 
		and_percent * 100.0 / acd_percent);


	fprintf(missedF, "<P>\nEnd of report\n</body>\n</html>\n");
	fclose(missedF);
	fprintf(dataF, "</table>\n<P>\nEnd of report\n</body>\n</html>\n");
	fclose(dataF);
}

#endif
