[voikko] enchant diff

Tue Aug 14 15:20:15 EEST 2007

$ diff -c -r enchant-1.3.0/tests/enchant-ispell.c enchant-1.3.0-hv/tests/enchant-ispell.c
*** enchant-1.3.0/tests/enchant-ispell.c	2005-11-24 22:20:35.000000000 +0200
--- enchant-1.3.0-hv/tests/enchant-ispell.c	2007-08-14 14:39:05.000000000 +0300
***************
*** 1,6 ****
--- 1,7 ----
  /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
  /* enchant
   * Copyright (C) 2003 Dom Lachowicz
+  *               2007 Hannu Väisänen
   *
   * This library is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
***************
*** 31,36 ****
--- 32,42 ----
  /**
   * This is a rough approximation of an "ispell compatibility mode"
   * for Enchant.
+  *
+  * Modified in 2007 to work when called from Emacs, which
+  * invokes a spelling program (e.g. enchant) like this:
+  *
+  * enchant -a -m -d dictionary
   */
  
  #include <stdio.h>
***************
*** 56,61 ****
--- 62,68 ----
  print_version (FILE * to)
  {
  	fprintf (to, "@(#) International Ispell Version 3.1.20 (but really Enchant %s)\n", VERSION);
+ 	fflush (to);
  }
  
  static void
***************
*** 66,71 ****
--- 73,79 ----
  	fprintf (to, "\t-l lists misspellings.\n");
  	fprintf (to, "\t-L displays line numbers.\n");
  	fprintf (to, "\t-v displays program version.\n");
+ 	fflush (to);
  }
  
  static gboolean
***************
*** 95,102 ****
  			g_string_assign (str, utf);
  			g_free (utf);
  		} 
! 		/* else str->str stays the same. we'll assume that it's 
! 		   already utf8 and glib is just being stupid */
  	}
  
  	return ret;
--- 103,110 ----
  			g_string_assign (str, utf);
  			g_free (utf);
  		} 
! 		/* Else str->str stays the same. we'll assume that it's 
! 		   already utf8 and glib is just being stupid. */
  	}
  
  	return ret;
***************
*** 113,121 ****
  		fwrite (native, 1, bytes_written, out);
  		g_free (native);
  	} else {
! 		/* we'll assume that it's already utf8 and glib is just being stupid */
  		fwrite (str, 1, strlen (str), out);
  	}
  }
  
  static void
--- 121,130 ----
  		fwrite (native, 1, bytes_written, out);
  		g_free (native);
  	} else {
! 		/* We'll assume that it's already utf8 and glib is just being stupid. */
  		fwrite (str, 1, strlen (str), out);
  	}
+ 	fflush (out);
  }
  
  static void
***************
*** 124,134 ****
  	size_t n_suggs;
  	char ** suggs;	
  
! 	if (word->len <= MIN_WORD_LENGTH || enchant_dict_check (dict, word->str, word->len) == 0)
  		if (lineCount)
  			fprintf (out, "* %ld\n", lineCount);
  		else
  			fwrite ("*\n", 1, 2, out);
  	else {
  		suggs = enchant_dict_suggest (dict, word->str, 
  					      word->len, &n_suggs);
--- 133,145 ----
  	size_t n_suggs;
  	char ** suggs;	
  
! 	if (word->len <= MIN_WORD_LENGTH || enchant_dict_check (dict, word->str, word->len) == 0) {
  		if (lineCount)
  			fprintf (out, "* %ld\n", lineCount);
  		else
  			fwrite ("*\n", 1, 2, out);
+ 		fflush (out);
+ 	}
  	else {
  		suggs = enchant_dict_suggest (dict, word->str, 
  					      word->len, &n_suggs);
***************
*** 137,143 ****
  			if (lineCount)
  				fprintf (out, "%ld ", lineCount);
  			print_utf (out, word->str);
! 			fprintf (out, " %ld\n", start_pos+1);
  		}
  		else {
  			size_t i = 0;
--- 148,155 ----
  			if (lineCount)
  				fprintf (out, "%ld ", lineCount);
  			print_utf (out, word->str);
! 			fprintf (out, " %ld\n", start_pos);
! 			fflush (out);
  		}
  		else {
  			size_t i = 0;
***************
*** 157,162 ****
--- 169,175 ----
  				else
  					fwrite ("\n", 1, 1, out);
  			}
+ 			fflush (out);
  
  			enchant_dict_free_string_list (dict, suggs);
  		}
***************
*** 171,234 ****
  			fprintf (out, "%ld ", lineCount);
  		print_utf (out, word->str);
  		fwrite ("\n", 1, 1, out);
  	}
  }
  
! /* splits a line into a set of (word,word_position) touples */
  static GSList *
  tokenize_line (GString * line)
  {
  	GSList * tokens = NULL;
- 	size_t start_pos, cur_pos;
  	char *utf = (char *) line->str;
  
  	GString * word;
  	
  	gunichar uc;
! 	
! 	start_pos = cur_pos = 0;
  	word = g_string_new (NULL);
  
  	while (cur_pos < line->len && *utf) {
! 		uc = g_utf8_get_char (utf); 
! 		
! 		switch (g_unichar_type(uc)) {
! 		case G_UNICODE_MODIFIER_LETTER:
! 		case G_UNICODE_LOWERCASE_LETTER:
! 		case G_UNICODE_TITLECASE_LETTER:
! 		case G_UNICODE_UPPERCASE_LETTER:
! 		case G_UNICODE_OTHER_LETTER:
! 		case G_UNICODE_COMBINING_MARK:
! 		case G_UNICODE_ENCLOSING_MARK:
! 		case G_UNICODE_NON_SPACING_MARK:
! 		case G_UNICODE_DECIMAL_NUMBER:
! 		case G_UNICODE_LETTER_NUMBER:
! 		case G_UNICODE_OTHER_NUMBER:
! 		case G_UNICODE_CONNECT_PUNCTUATION:
  			g_string_append_unichar (word, uc);
! 			cur_pos++;
! 			break;
! 		case G_UNICODE_OTHER_PUNCTUATION:
! 			if (uc == '\'') {
! 				g_string_append_unichar (word, uc);
! 				cur_pos++;
! 				break;
! 			}
! 			/* else fall through */
! 		default: /* some sort of non-word character */
! 			if (word->len) {
! 				tokens = g_slist_append (tokens,
! 							 g_string_new_len (word->str, word->len));
! 				tokens = g_slist_append (tokens,
! 							 GINT_TO_POINTER(start_pos));
! 				g_string_truncate (word, 0);
! 				start_pos = ++cur_pos;
! 			}
! 			break;
  		}
- 		utf = g_utf8_next_char (utf);
- 	}
  
  	g_string_free (word, TRUE);
  
  	return tokens;
--- 184,258 ----
  			fprintf (out, "%ld ", lineCount);
  		print_utf (out, word->str);
  		fwrite ("\n", 1, 1, out);
+ 		fflush (out);
+ 	}
+ }
+ 
+ 
+ int	/* 1 if uc counts as a word character, 0 otherwise */
+ is_word_char (gunichar uc)
+ {
+ 	switch (g_unichar_type(uc)) {	/* classify by the type GLib reports for uc */
+ 	case G_UNICODE_MODIFIER_LETTER:
+ 	case G_UNICODE_LOWERCASE_LETTER:
+ 	case G_UNICODE_TITLECASE_LETTER:
+ 	case G_UNICODE_UPPERCASE_LETTER:
+ 	case G_UNICODE_OTHER_LETTER:
+ 	case G_UNICODE_COMBINING_MARK:
+ 	case G_UNICODE_ENCLOSING_MARK:
+ 	case G_UNICODE_NON_SPACING_MARK:
+ 	case G_UNICODE_DECIMAL_NUMBER:
+ 	case G_UNICODE_LETTER_NUMBER:
+ 	case G_UNICODE_OTHER_NUMBER:
+ 	case G_UNICODE_CONNECT_PUNCTUATION:
+                 return 1;     /* Enchant 1.3.0 defines word chars like this. */
+ 	default:	/* every type not listed above; the old tokenizer's apostrophe special case is not kept */
+ 		return 0;
  	}
  }
  
! 
! /* Splits a line into a set of (word, word_position) tuples. */
  static GSList *
  tokenize_line (GString * line)
  {
  	GSList * tokens = NULL;
  	char *utf = (char *) line->str;
  
  	GString * word;
  	
  	gunichar uc;
! 	size_t cur_pos = 0;
! 	size_t start_pos = 0;
  	word = g_string_new (NULL);
  
  	while (cur_pos < line->len && *utf) {
! 
! 	        /* Skip non-word characters. */
! 		cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
! 		uc = g_utf8_get_char (utf);
! 		while (cur_pos < line->len && *utf && !is_word_char(uc)) {
! 		        utf = g_utf8_next_char (utf);
! 			uc = g_utf8_get_char (utf);
! 			cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
! 		}
! 		start_pos = cur_pos;
! 
! 		/* Skip over word. */
! 		while (cur_pos < line->len && *utf && is_word_char(uc)) {
  			g_string_append_unichar (word, uc);
! 		        utf = g_utf8_next_char (utf);
! 			uc = g_utf8_get_char (utf);
! 			cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
  		}
  
+ 		/* Save (word, position) tuple. */
+                 if (word->len) {
+ 		        tokens = g_slist_append (tokens, g_string_new_len (word->str, word->len));
+ 			tokens = g_slist_append (tokens, GINT_TO_POINTER(start_pos));
+ 			g_string_truncate (word, 0);
+ 		}
+ 	}
  	g_string_free (word, TRUE);
  
  	return tokens;
***************
*** 261,266 ****
--- 285,291 ----
  
  	if (!dict) {
  		fprintf (stderr, "Couldn't create a dictionary for %s\n", lang);
+ 		fflush (stderr);
  		g_free (lang);
  		enchant_broker_free (broker);
  		return 1;
***************
*** 277,283 ****
  			lineCount++;
  
  		if (str->len) {
- 
  			corrected_something = FALSE;
  			token_ptr = tokens = tokenize_line (str);
  			while (tokens != NULL) {
--- 302,307 ----
***************
*** 298,306 ****
  				g_slist_free (token_ptr);
  		} 
  		
! 		if (mode == MODE_A && corrected_something)
  			fwrite ("\n", 1, 1, out);
! 		
  		g_string_truncate (str, 0);
  	}
  	
--- 322,331 ----
  				g_slist_free (token_ptr);
  		} 
  		
! 		if (mode == MODE_A && corrected_something) {
  			fwrite ("\n", 1, 1, out);
! 			fflush (out);
! 		}
  		g_string_truncate (str, 0);
  	}
  	
***************
*** 329,335 ****
  		char * arg = argv[i];
  		if (arg[0] == '-') {
  			if (strlen (arg) == 2) {
! 				/* it seems that the first one of these that is specified gets precedence */
  				if (arg[1] == 'a' && MODE_NONE == mode)
  					mode = MODE_A;
  				else if (arg[1] == 'l' && MODE_NONE == mode)
--- 354,360 ----
  		char * arg = argv[i];
  		if (arg[0] == '-') {
  			if (strlen (arg) == 2) {
! 				/* It seems that the first one of these that is specified gets precedence. */
  				if (arg[1] == 'a' && MODE_NONE == mode)
  					mode = MODE_A;
  				else if (arg[1] == 'l' && MODE_NONE == mode)
***************
*** 338,344 ****
--- 363,376 ----
  					mode = MODE_VERSION;
  				else if (arg[1] == 'L' && MODE_NONE == mode)
  					countLines = 1;
+ 				else if (arg[1] == 'm')
+ 				     	; /* Ignore. Emacs calls ispell with '-m'. */
+ 				else if (arg[1] == 'd')
+ 				     	i++; /* Ignore. Emacs calls ispell with '-d dictionary'. */
  			} 
+ 			else if ((strlen (arg) == 3) && (arg[1] == 'v') && (arg[2] == 'v')) {
+ 			     	mode = MODE_VERSION;   /* Emacs (or ispell.el) calls [ai]spell with '-vv'. */
+ 			}
  			else if (strlen (arg) > 2) {
  				fprintf (stderr, "-%c does not take any parameters.\n", arg[1]);
  				exit(1);