[voikko] enchant-ispell diff

Mon Sep 10 11:49:16 EEST 2007

Liitteenä.
-------------- next part --------------

--- enchant/tests/enchant-ispell.c	2007-08-15 07:12:04.000000000 +0300
+++ enchant-hv/tests/enchant-ispell.c	2007-09-07 14:19:09.000000000 +0300
@@ -1,6 +1,7 @@
 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
 /* enchant
  * Copyright (C) 2003 Dom Lachowicz
+ *               2007 Hannu Väisänen
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -31,6 +32,11 @@
 /**
  * This is a rough approximation of an "ispell compatibility mode"
  * for Enchant.
+ *
+ * Modified in 2007 to work when called from emacs which
+ * calls a spelling program (e.g. enchant) like this
+ *
+ * enchant -a -m -d dictionary
  */
 
 #include <stdio.h>
@@ -44,6 +50,7 @@
 /* word has to be bigger than this to be checked */
 #define MIN_WORD_LENGTH 1
 
+
 typedef enum 
 	{
 		MODE_NONE,
@@ -56,14 +63,17 @@
 print_version (FILE * to)
 {
 	fprintf (to, "@(#) International Ispell Version 3.1.20 (but really Enchant %s)\n", VERSION);
+	fflush (to);
 }
 
 static void
 print_help (FILE * to, const char * prog)
 {
-	fprintf (to, "Usage: %s [options] -a|-l|-L|-v[v]|<file>\n", prog);
+	fprintf (to, "Usage: %s [options] -a|-d dict|-l|-L|-m|-v[v]|<file>\n", prog);
 	fprintf (to, "\t-a lists alternatives.\n");
+	fprintf (to, "\t-d dict uses dictionary <dict>.\n");
 	fprintf (to, "\t-l lists misspellings.\n");
+	fprintf (to, "\t-m is ignored.\n");
 	fprintf (to, "\t-L displays line numbers.\n");
 	fprintf (to, "\t-v displays program version.\n");
 }
@@ -95,8 +105,8 @@
 			g_string_assign (str, utf);
 			g_free (utf);
 		} 
-		/* else str->str stays the same. we'll assume that it's 
-		   already utf8 and glib is just being stupid */
+		/* Else str->str stays the same. we'll assume that it's 
+		   already utf8 and glib is just being stupid. */
 	}
 
 	return ret;
@@ -113,7 +123,7 @@
 		fwrite (native, 1, bytes_written, out);
 		g_free (native);
 	} else {
-		/* we'll assume that it's already utf8 and glib is just being stupid */
+		/* We'll assume that it's already utf8 and glib is just being stupid. */
 		fwrite (str, 1, strlen (str), out);
 	}
 }
@@ -124,11 +134,12 @@
 	size_t n_suggs;
 	char ** suggs;	
 
-	if (word->len <= MIN_WORD_LENGTH || enchant_dict_check (dict, word->str, word->len) == 0)
+	if (word->len <= MIN_WORD_LENGTH || enchant_dict_check (dict, word->str, word->len) == 0) {
 		if (lineCount)
 			fprintf (out, "* %ld\n", lineCount);
 		else
 			fwrite ("*\n", 1, 2, out);
+	}
 	else {
 		suggs = enchant_dict_suggest (dict, word->str, 
 					      word->len, &n_suggs);
@@ -137,7 +148,7 @@
 			if (lineCount)
 				fprintf (out, "%ld ", lineCount);
 			print_utf (out, word->str);
-			fprintf (out, " %ld\n", start_pos+1);
+			fprintf (out, " %ld\n", start_pos);
 		}
 		else {
 			size_t i = 0;
@@ -174,68 +185,164 @@
 	}
 }
 
-/* splits a line into a set of (word,word_position) touples */
+
+static int
+is_word_char (gunichar uc, size_t n)
+{
+	switch (g_unichar_type(uc)) {
+	case G_UNICODE_MODIFIER_LETTER:
+	case G_UNICODE_LOWERCASE_LETTER:
+	case G_UNICODE_TITLECASE_LETTER:
+	case G_UNICODE_UPPERCASE_LETTER:
+	case G_UNICODE_OTHER_LETTER:
+	case G_UNICODE_COMBINING_MARK:
+	case G_UNICODE_ENCLOSING_MARK:
+	case G_UNICODE_NON_SPACING_MARK:
+	case G_UNICODE_DECIMAL_NUMBER:
+	case G_UNICODE_LETTER_NUMBER:
+	case G_UNICODE_OTHER_NUMBER:
+	case G_UNICODE_CONNECT_PUNCTUATION:
+                return 1;     /* Enchant 1.3.0 defines word chars like this. */
+	default:
+		if ((n > 0) && (uc == g_utf8_get_char("'"))) {
+		        return 1;  /** Char ' is accepted only within a word. */
+		}
+		return 0;
+	}
+}
+
+
+typedef struct lang_map {
+  char *ispell;
+  char *enchant;
+} LangMap;
+
+
+/* Maps ispell language codes to enchant language codes. */
+/* The list is partially taken from src/ispell/ispell_checker.cpp. */
+static const LangMap lingua[] = {
+	{"american",	"en_US"},
+	{"brazilian",	"pt_BR"},
+	{"british",	"en_GB"},
+	{"bulgarian",	"bg"},
+	{"catala",	"ca"},
+	{"catalan",	"ca"},
+	{"danish",	"da"},
+	{"dansk",	"da"},
+	{"deutsch",	"de"},
+	{"dutch",	"nl"},
+	{"ellhnika",	"el"},
+	{"espanol",	"es"},
+	{"esperanto",	"eo"},
+	{"estonian",	"et"},
+	{"faeroese",	"fo"},
+	{"finnish",	"fi"},
+	{"francais",	"fr"},
+	{"french",	"fr"},
+	{"galician",	"gl"},
+	{"german",	"de"},
+	{"hungarian",	"hu"},
+	{"interlingua",	"ia"},
+	{"irish",	"ga"},
+	{"italian",	"it"},
+	{"latin",	"la"},
+	{"lietuviu",	"lt"},
+	{"lithuanian",	"lt"},
+	{"mlatin",	"la"},
+	{"nederlands",	"nl"},
+	{"norsk",	"no"},
+	{"norwegian",	"no"},
+	{"nynorsk",	"nn"},
+	{"polish",	"pl"},
+	{"portugues",	"pt"},
+	{"portuguese",	"pt"},
+	{"russian",	"ru"},
+	{"sardinian",	"sc"},
+	{"slovak",	"sk"},
+	{"slovenian",	"sl"},
+	{"slovensko",	"sl"},
+	{"spanish",	"es"},
+	{"suomi",	"fi"},   /* For Emacs/Voikko/tmispell compatibility. */
+	{"svenska",	"sv"},
+	{"swedish",	"sv"},
+	{"swiss",	"de_CH"},
+	{"ukrainian",	"uk"},
+	{"yiddish-yivo",	"yi"},
+	{NULL,       NULL}    /* Last item must be {NULL, NULL}. */
+};
+
+
+/* Converts ispell language code to enchant language code. */
+static gchar *
+convert_language_code (gchar *code)
+{
+	size_t i;
+	for (i = 0; lingua[i].ispell; i++) {
+	        if (!strcmp(code,lingua[i].ispell)) {
+			/* We must call g_strdup() because the calling program g_free()s the result. */
+		        return g_strdup (lingua[i].enchant);
+		}
+	}
+	/* Let's call g_strdup() here too! */
+	return g_strdup (code);
+}
+
+
+/* Splits a line into a set of (word,word_position) touples. */
 static GSList *
 tokenize_line (GString * line)
 {
 	GSList * tokens = NULL;
-	size_t start_pos, cur_pos;
 	char *utf = (char *) line->str;
 
 	GString * word;
 	
 	gunichar uc;
-	
-	start_pos = cur_pos = 0;
+	size_t cur_pos = 0;
+	size_t start_pos = 0;
 	word = g_string_new (NULL);
 
 	while (cur_pos < line->len && *utf) {
-		uc = g_utf8_get_char (utf); 
-		
-		switch (g_unichar_type(uc)) {
-		case G_UNICODE_MODIFIER_LETTER:
-		case G_UNICODE_LOWERCASE_LETTER:
-		case G_UNICODE_TITLECASE_LETTER:
-		case G_UNICODE_UPPERCASE_LETTER:
-		case G_UNICODE_OTHER_LETTER:
-		case G_UNICODE_COMBINING_MARK:
-		case G_UNICODE_ENCLOSING_MARK:
-		case G_UNICODE_NON_SPACING_MARK:
-		case G_UNICODE_DECIMAL_NUMBER:
-		case G_UNICODE_LETTER_NUMBER:
-		case G_UNICODE_OTHER_NUMBER:
-		case G_UNICODE_CONNECT_PUNCTUATION:
+
+	        /* Skip non-word characters. */
+		cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
+		uc = g_utf8_get_char (utf);
+		while (cur_pos < line->len && *utf && !is_word_char(uc,0)) {
+		        utf = g_utf8_next_char (utf);
+			uc = g_utf8_get_char (utf);
+			cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
+		}
+		start_pos = cur_pos;
+
+		/* Skip over word. */
+		while (cur_pos < line->len && *utf && is_word_char(uc,1)) {
 			g_string_append_unichar (word, uc);
-			cur_pos++;
-			break;
-		case G_UNICODE_OTHER_PUNCTUATION:
-			if (uc == '\'') {
-				g_string_append_unichar (word, uc);
-				cur_pos++;
-				break;
-			}
-			/* else fall through */
-		default: /* some sort of non-word character */
-			if (word->len) {
-				tokens = g_slist_append (tokens,
-							 g_string_new_len (word->str, word->len));
-				tokens = g_slist_append (tokens,
-							 GINT_TO_POINTER(start_pos));
-				g_string_truncate (word, 0);
-				start_pos = ++cur_pos;
-			}
-			break;
+		        utf = g_utf8_next_char (utf);
+			uc = g_utf8_get_char (utf);
+			cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
+		}
+
+	        /* Do not accept one or more  ' at the end of the word. */
+		int i = word->len-1;
+	        while ((i >= 0) && (word->str[i] == '\'')) {
+	                g_string_truncate (word, i);
+			i--;
 		}
-		utf = g_utf8_next_char (utf);
-	}
 
+		/* Save (word, position) touple. */
+                if (word->len) {
+		        tokens = g_slist_append (tokens, g_string_new_len (word->str, word->len));
+			tokens = g_slist_append (tokens, GINT_TO_POINTER(start_pos));
+			g_string_truncate (word, 0);
+		}
+	}
 	g_string_free (word, TRUE);
 
 	return tokens;
 }
 
 static int
-parse_file (FILE * in, FILE * out, IspellMode_t mode, int countLines)
+parse_file (FILE * in, FILE * out, IspellMode_t mode, int countLines, gchar *dictionary)
 {
 	EnchantBroker * broker;
 	EnchantDict * dict;
@@ -250,9 +357,14 @@
 	if (mode == MODE_A)
 		print_version (out);
 
-	lang = enchant_get_user_language();
-	if(!lang)
-		return 1;
+	if (dictionary) {
+		lang = convert_language_code (dictionary);
+	}
+	else {
+	        lang = enchant_get_user_language();
+		if(!lang)
+			return 1;
+ 	}
 
 	/* Enchant will get rid of useless trailing garbage like de_DE at euro or de_DE.ISO-8859-15 */
 	
@@ -277,7 +389,6 @@
 			lineCount++;
 
 		if (str->len) {
-
 			corrected_something = FALSE;
 			token_ptr = tokens = tokenize_line (str);
 			while (tokens != NULL) {
@@ -293,17 +404,17 @@
 				else if (mode == MODE_L)
 					do_mode_l (out, dict, word, lineCount);
 			}
-
 			if (token_ptr)
 				g_slist_free (token_ptr);
 		} 
 		
-		if (mode == MODE_A && corrected_something)
+		if (mode == MODE_A && corrected_something) {
 			fwrite ("\n", 1, 1, out);
-		
+		}
 		g_string_truncate (str, 0);
+		fflush (out);
 	}
-	
+
 	enchant_broker_free_dict (broker, dict);
 	enchant_broker_free (broker);
 
@@ -324,12 +435,14 @@
 	FILE * fp = stdin;
 
 	int countLines = 0;
-	
+	gchar *dictionary = 0;  /* -d dictionary */
+
+
 	for (i = 1; i < argc; i++) {
 		char * arg = argv[i];
 		if (arg[0] == '-') {
 			if (strlen (arg) == 2) {
-				/* it seems that the first one of these that is specified gets precedence */
+				/* It seems that the first one of these that is specified gets precedence. */
 				if (arg[1] == 'a' && MODE_NONE == mode)
 					mode = MODE_A;
 				else if (arg[1] == 'l' && MODE_NONE == mode)
@@ -338,7 +451,19 @@
 					mode = MODE_VERSION;
 				else if (arg[1] == 'L' && MODE_NONE == mode)
 					countLines = 1;
+				else if (arg[1] == 'm')
+				     	; /* Ignore. Emacs calls ispell with '-m'. */
+				else if (arg[1] == 'd') {
+				     	i++;
+					dictionary = argv[i];  /* Emacs calls ispell with '-d dictionary'. */
+				}
 			} 
+			else if ((strlen (arg) == 3) && (arg[1] == 'v') && (arg[2] == 'v')) {
+			     	mode = MODE_VERSION;   /* Emacs (or ispell.el) calls [ai]spell with '-vv'. */
+			}
+			else if (arg[1] == 'd') {
+			        dictionary = arg + 2;  /* Accept "-ddictionary", i.e. no space between -d and dictionary. */
+			}
 			else if (strlen (arg) > 2) {
 				fprintf (stderr, "-%c does not take any parameters.\n", arg[1]);
 				exit(1);
@@ -365,7 +490,7 @@
 			}
 		}
 		
-		rval = parse_file (fp, stdout, mode, countLines);
+		rval = parse_file (fp, stdout, mode, countLines, dictionary);
 		
 		if (file)
 			fclose (fp);