// unigram.c
// generated by "uni2c 10 \\jpittman1\c\hwx\inferno\tools\corp2uni\debug\debug.frq unigram.c"
// Time: Tue Dec 22 10:06:58 1998

// JPittman (Dec 22, 1998): Modified corp2uni.exe to include apostrophes within words when they
// do not begin or end the word.  We do this to avoid apostrophes that were used as
// single quotes, but because of this we will omit 'twas and bitchin' (which probably would
// not make our 10-word frequency cut-off anyway).
#include <common.h>
#include <ctype.h>
#include "trie.h"
#include "sysdict.h"

#define UNIGRAM_MAXCOST 4222

// This version converts the word to lowercase, strips off leading
// and trailing punctuation, and even sums the scores of compound
// words, whether they are delimited by hyphens, ampersands, underscores,
// or whatever.
// The table is lowercase alpha and apostrophes.
// This code treats the apostrophe as if it were another letter in the alphabet.

// Returns 1 when c is an ASCII letter (either case) or an apostrophe,
// and 0 for every other byte value.  The apostrophe counts as a letter
// so that it can appear inside words.
static int isletter(unsigned char c)
{
	if ('A' <= c && c <= 'Z')
		return 1;

	if ('a' <= c && c <= 'z')
		return 1;

	return c == '\'';
}

// This is currently very specific to plain ASCII.
// I will convert to a 1252-specific version soon.

// Lowercases the NUL-terminated string s in place.  Only the ASCII
// range 'A'..'Z' is converted; every other byte is left untouched.
static void strlower(unsigned char *s)
{
	while (*s != '\0')
	{
		if ('A' <= *s && *s <= 'Z')
			*s = (unsigned char)(*s + ('a' - 'A'));
		s++;
	}
}

// Looks up a single word in the dictionary via Word2Tag() and returns its
// cost, trying progressively more lowercased spellings:
//   1. the word exactly as given,
//   2. the word with everything after the first character lowercased,
//   3. the word entirely lowercased.
// On the first successful lookup the low 16 bits of the value stored by
// Word2Tag() are returned.  If no spelling is found, UNIGRAM_MAXCOST is
// returned.
//
// NOTE: pWord is lowercased in place as the fallbacks are tried, so the
// caller's buffer may be modified on return.
static int
LookupInDict(unsigned char *pWord)
{
	DWORD	cost;

	// Look up word as is.
	if (Word2Tag(pWord, &cost)) {
		return (cost & 0x0000FFFF);
	}

	// An empty word has no case variants to try.  Bail out here so that
	// strlower(pWord + 1) below never starts past the terminator, where it
	// would read (and possibly modify) bytes that are not part of this word.
	if (*pWord == '\0') {
		return UNIGRAM_MAXCOST;
	}

	// Look up word with all but first in lower case.
	strlower(pWord + 1);
	if (Word2Tag(pWord, &cost)) {
		return (cost & 0x0000FFFF);
	}

	// Look up word with all in lower case.
	strlower(pWord);
	if (Word2Tag(pWord, &cost)) {
		return (cost & 0x0000FFFF);
	}

	// Not in dict, use default cost.
	return UNIGRAM_MAXCOST;
}

// Computes the unigram cost of szWord.
//
// The word is copied into a local buffer (the caller's string is never
// modified), leading and trailing non-letter characters are stripped, and
// what remains is split into components at every run of non-letter
// characters (hyphens, ampersands, underscores, ...).  Each component is
// looked up with LookupInDict() and the component costs are summed.
//
// Returns UNIGRAM_MAXCOST when szWord is NULL, is too long for the local
// buffer (64 or more characters), or contains no letters at all.
int UnigramCost(unsigned char *szWord)
{
	unsigned char szBuffer[64], *psz, *p;
	int cost = 0;

	if (szWord == NULL)
		return UNIGRAM_MAXCOST;

	// Check word length: it must fit in the buffer along with its terminator.
	if (sizeof(szBuffer) <= strlen((const char *)szWord))
		return UNIGRAM_MAXCOST;

	// Put word in local buffer.
	strcpy((char *)szBuffer, (const char *)szWord);

	// Find first letter.
	psz = szBuffer;
	while (*psz && !isletter(*psz))
		psz++;

	// No first letter -> max cost for now.
	if (!*psz)
		return UNIGRAM_MAXCOST;

	// Find last letter and cut the string right after it.  psz points at a
	// letter, so the backwards scan is guaranteed to stop inside the buffer.
	p = psz + strlen((const char *)psz) - 1;
	while (!isletter(*p))
		p--;
	p[1] = '\0';

	// The string now begins and ends with a letter.  Walk it component by
	// component.
	while (*psz)
	{
		// Find the end of the current component.
		p = psz;
		while (isletter(*p))
			p++;

		if (*p)
		{
			// Terminate the component, then skip the entire run of
			// delimiters.  Stepping over only one delimiter would turn
			// "a--b" into an extra empty component that gets looked up
			// (and charged a cost) on its own.
			*p++ = '\0';
			while (*p && !isletter(*p))
				p++;
		}
		cost += LookupInDict(psz);
		psz = p;
	}
	return cost;
}
