/*******************************************************************************
+
+	cluster.cc
+
+   Copyright (C) 2000
+	Kevin Pulo, kev@hons.cs.usyd.edu.au.
+	Garrick Welsh, gaz@hons.cs.usyd.edu.au.
+
+	This program is free software; you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation; either version 2 of the License, or
+	(at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this program; if not, write to the Free Software
+	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+	$Id: cluster.cc,v 1.10 2000/05/28 11:31:19 kev Exp kev $
+
*******************************************************************************/

#include "cluster.hh"
#include "algorithms.hh"

#include <errno.h>
#include <limits.h>
#include <stdlib.h>


/* Output stream opened on /dev/null; main() uses it as the default target
   for debug output.  Relies on the /dev/null device being available. */
static ofstream null("/dev/null");
/* Stream that all debug messages are written to.  main() points it at
   `null` by default and at cerr when --debug/-debug is given.  It has
   external linkage, so other translation units may write to it as well. */
ostream *debug;

/* RCS keyword strings.  They are not referenced anywhere in the visible part
   of this file.
   NOTE(review): presumably kept so the revision is embedded in the compiled
   binary -- confirm before removing. */
static const char *rcsid = "$Id: cluster.cc,v 1.10 2000/05/28 11:31:19 kev Exp kev $";
static const char *rcsrevision = "$Revision: 1.10 $";
/* Help text printed by --help/-help/-h and by badCommand().
   NOTE(review): main() also accepts --debug/-debug and the single-dash
   spelling -algorithm, neither of which is listed here. */
static const char *usage = "Usage: cluster [options]\n"
"Options:\n"
"    --help, -help, -h  prints this usage message\n"
"    --algorithm algorithm_name [k]\n"
"        Specifies the clustering algorithm to use, and optionally a suggestion\n"
"        for the number of clusters (k) for those algorithms which need or\n"
"        benefit from it. algorithm_name defaults to 'kmeans', k defaults to 4.\n"
"\n"
"Available algorithms:\n"
"* kmeans:     The standard k-means algorithm. Requires k.\n"
"* tb:         The Teitz-Bart k-medoids heuristic. Requires k.\n"
"* ech:        The ECH variant of the TB k-medoids heuristic.\n"
"* random:     Random clustering (equal sized clusters). Requires k.\n"
"* cheat:      Cheating clustering, using the annotation.\n"
"* ascending:  Puts each point into its own cluster.\n"
"* single:     Puts all points into a single cluster.\n"
;


/* The dataset: every distinct point read from standard input by main(). */
point_set T;
/* Maps each point in T to the rest of its input line (the text that follows
   the point's coordinates). */
dictionary<point, string> annotation;
/* The clustering result that main() writes out, one cluster id per array
   element.
   NOTE(review): presumably filled in by the algorithm selected through
   runAlgorithm() -- that code is not visible in this file. */
array<Cluster> clusters;
/* Points that main() writes out with cluster id -1. */
Cluster outliers;

/* Reports an unrecognised command line: writes "Illegal Command", a blank
   line and the usage text to stderr, then terminates the program with exit
   status 1.  Never returns. */
void badCommand(void)
{
	cerr << "Illegal Command" << endl;
	cerr << endl;
	cerr << usage << endl;
	exit(1);
}


/* Consumes leading blanks (spaces and horizontal tabs) from the stream.
   Stops at the first other character -- including a newline -- or at end of
   input, leaving that character unread.  Returns the stream it was given. */
istream &skipspace(istream &in) {
	for (int next = in.peek(); next == ' ' || next == '\t'; next = in.peek()) {
		in.ignore(1);
	}
	return in;
}


/* Reads the rest of the current line from `in` and returns it without the
   terminating newline; the newline itself is consumed.  A final line that
   ends at end-of-input instead of a newline is returned as-is.

   The line is read in fixed-size chunks with istream::get() rather than
   istream::getline(), because getline() extracts the '\n' without reporting
   it, which makes "buffer full" and "end of line" indistinguishable.

   An empty rest-of-line is a valid result and must not leave the stream in
   a failed state: standard-conforming libraries set failbit when get()
   stores no characters, so that particular failure is cleared here.
*/
string readline(istream &in) {
	const int buflen = 1024;
	char s[buflen];
	string res;
	int ch;

	// Only attempt a read on a healthy stream, so that a failbit seen after
	// get() is known to have been raised by this call and not inherited.
	while (in.good()) {
		// Guarantee a terminated buffer even if get() stores nothing.
		s[0] = '\0';
		in.get(s, buflen, '\n');
		res += s;

		// Nothing stored, yet neither end-of-input nor a hard error: the
		// next character is the newline, i.e. the line was simply empty.
		if (in.gcount() == 0 && !in.eof() && !in.bad()) {
			in.clear(in.rdstate() & ~ios::failbit);
		}

		ch = in.peek();
		if (ch == EOF) {
			break;
		} else if (ch == '\n') {
			in.ignore(1);
			break;
		}
		// Otherwise the chunk buffer filled up mid-line; keep reading.
	}
	return res;
}



/* Writes the whole dataset to the debug stream: a "DATASET:" heading
   followed by one line per point in T, each consisting of the point and its
   annotation, framed by blank lines. */
static void debugPrintDataset() {
	point p;
	// Fetch the point list once; the original fetched it twice and discarded
	// the first copy.
	list<point> L = T.points();

	(*debug) << endl;
	(*debug) << "DATASET:" << endl;
	forall(p, L) {
		(*debug) << p << " " << annotation.access(p) << endl;
	}
	(*debug) << endl;
}


/* Parses a cluster-count command line argument.
   Accepts only a complete base-10 integer in the range [1, INT_MAX].
   On success stores the value in `k` and returns true; otherwise leaves `k`
   untouched and returns false. */
static bool parseClusterCount(const char *text, int &k) {
	char *end;
	long value;

	errno = 0;
	value = strtol(text, &end, 10);
	if (end == text || *end != '\0' || errno == ERANGE) {
		return false;
	}
	if (value < 1 || value > INT_MAX) {
		return false;
	}

	k = (int) value;
	return true;
}


/* Program entry point.
   1. Parses the command line (--debug, --algorithm name [k], --help).
   2. Reads the dataset from standard input, one point per line, each
      optionally followed by free-form annotation text.
   3. Runs the selected algorithm via runAlgorithm().
   4. Writes the result to standard output as "<cluster id> <point> <text>"
      lines: outliers first with id -1, then each cluster's representative
      followed by its members.
   Returns 0 on success and 1 on a command line or input error. */
int main(int argc, char *argv[]) {
	point p;
	string s;
	string algorithm;
	int k;
	int i;


	// Defaults
	// Debug output is discarded unless --debug is given.  The sink is a
	// stream on /dev/null, which is not portable.
	debug = &null;
	k = 4;
	algorithm = "kmeans";


	////////////////////
	// Process command line arguments.
	////////////////////
	for (i = 1; i < argc; i++) {
		string arg(argv[i]);

		if (arg == "--debug" || arg == "-debug") {
			debug = &cerr;
		} else if (arg == "--algorithm" || arg == "-algorithm") {
			i++;
			if (i >= argc) {
				cerr << "ERROR: Usage: --algorithm algorithmname [k]" << endl;
				return 1;
			}
			algorithm = argv[i];

			// An optional k follows unless the next argument looks like
			// another option.
			if ( (i < argc - 1) && (argv[i+1][0] != '-') ) {
				i++;
				if (!parseClusterCount(argv[i], k)) {
					cerr << "ERROR: Invalid cluster count '" << argv[i]
					     << "': k must be a positive integer." << endl;
					return 1;
				}
			}
		} else if (arg == "--help" || arg == "-help" || arg == "-h") {
			cout << usage << endl;
			return 0;
		} else {
			badCommand();
		}
	}


	////////////////////
	// Read the dataset.
	////////////////////
	while (true) {
		// Read the mandatory part of the line: the point itself.
		cin >> p;

		// End of input while reading the point means there is no further
		// complete line, so stop reading.
		if (cin.eof()) {
			break;
		}

		// Any other failure leaves the stream permanently failed; without
		// this check the loop would never terminate.
		if (cin.fail()) {
			cerr << "ERROR: Failed to read a point from the input." << endl;
			return 1;
		}

		// Read the optional part: the annotation text up to end of line.
		// EOF in here is fine - incomplete last line, that's all.
		skipspace(cin);
		s = readline(cin);

		// Store the point, ignoring repeats of a point already seen.
		if (T.lookup(p) == nil) {
			(*debug) << "ADDING: " << p << " " << s << endl;
			T.insert(p);
			annotation.insert(p, s);
		} else {
			cerr << "WARNING: Duplicate point " << p << " ignored." << endl;
		}
	}


	registerAlgorithms();


	////////////////////
	// Cluster the dataset.
	////////////////////
	debugPrintDataset();
	runAlgorithm(algorithm, k);
	(*debug) << "Algorithm completed" << endl;


	////////////////////
	// Write the clustered dataset out.
	////////////////////
	// Outliers are reported with the pseudo cluster id -1.
	list<point> &pts = outliers.getList();
	forall(p, pts) {
		cout << -1 << " " << p << " " << annotation.access(p) << endl;
	}

	// Clusters are numbered from 0 in array order; each one starts with a
	// line for its representative.
	i = 0;
	Cluster c;
	forall(c, clusters) {
		cout << i << " " << c.rep << " representative" << endl;
		list<point> &members = c.getList();
		forall(p, members) {
			cout << i << " " << p << " " << annotation.access(p) << endl;
		}
		i++;
	}


	return 0;
}

