// --------------------------- lispify.cc --------------------------------
// input an Esperanto sentence from standard input
// output a sentence in lisp format to standard output
// affixes are parsed by this program
// punctuations are converted as well
// Jui-Yuan Fred Hsu.  May 1995 
//  
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2, or (at your option)
// any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.


// ------------------ customization ------------------------------------


// Characters that check_punc treats as word boundaries in addition to the
// punctuators listed in punc_c.
#define  PUNC_SEPARATORS " \t\n"


// Size of the per-word input buffer in main (declared with 3 spare bytes).
#define LISP_BUF_SIZE  1000
// Size passed to the StrTok constructor for the 'original' token list in main.
#define ORIG_BUF_SIZE  4000

// ----------------------------------------------------------------------

#include  "Debug.h"
#undef    LOCAL_DEBUG
#define   LOCAL_DEBUG  0

#include "common.h"          // common definitions
#include "Collect.h"         // Collection of object (dynamic)
#include "str.h"             // String class
#include "strtok.h"          // String Token parser

#include <fstream.h>         // file stream processing

typedef  Collect<StrTok>      StrTokArray;         // Array of String tokens


// ---------------------- check_punc -------------------------------------
// Classify one input character as "word boundary" or "part of a word".
//
// c : character to test
//
// returns 1 if c is one of PUNC_SEPARATORS, or if c equals the first
//           character of any punc_c entry
// returns 0 otherwise
//
// Note: strchr also matches the string terminator, so c == '\0' yields 1.
// ----------------------------------------------------------------------
int check_punc(char c)
{
  if (strchr(PUNC_SEPARATORS, c) != NULL)       // separator character?
    return 1;

  int idx = 0;
  while (idx < PUNC_NUM)                        // real punctuator?
   {
     if (c == punc_c[idx][0])
       return 1;
     ++idx;
   }

  return 0;
}


// -------------------- print_punc -----------------------------------------
// Print the output form of a punctuator to standard output.
//
// c : character to look up; it is compared with the first character of
//     each punc_c entry
//
// On a match, the corresponding punc_str entry followed by one space is
// written to cout.  Characters with no punc_c entry print nothing.
//
// returns the matching punc_c entry (the original punctuator, as a string)
// returns NULL if c is not a punctuator
// ----------------------------------------------------------------------
char* print_punc(char c)
{
  // Return from inside the loop: the index is then never needed after the
  // loop ends, where a variable declared in the for-init is out of scope.
  for (int i=0; i<PUNC_NUM; i++)
    if (punc_c[i][0] == c)
     {
       cout << punc_str[i] <<' ';
       return punc_c[i];
     }

  return NULL;
}


// Position constraints understood by find_substr's 'adjust' argument.
#define ADJ_NONE  0   // the match may sit anywhere in the string
#define ADJ_LEFT  1   // the match must begin at the start of the string
#define ADJ_RIGHT 2   // the match must end at the end of the string


// ----------------------- find_substr -----------------------------------
// Find the longest entry of 'array' (column COL_ESP) that occurs in 'str'
// at a position allowed by 'adjust'.
// For the future, may want to consider sorting the dictionary in
//  popularity order, and simply use the first one we find.
//
// str    : string to search (not modified)
// array  : lexicon table whose COL_ESP column holds the candidates
// adjust : ADJ_NONE  - first occurrence anywhere in str
//          ADJ_LEFT  - candidate must be a prefix of str
//          ADJ_RIGHT - candidate must be a suffix of str
//          any other value never matches
// rx     : out; set to the character just past the match, NULL if none
// endx   : out; set to the terminating NUL of str, NULL if none
//
// returns a pointer into str where the match starts, or NULL if nothing
// matches.  Empty candidates never match; among equally long candidates
// the first one in the table wins.
// ----------------------------------------------------------------------
char * find_substr(char* str, StrTokArray &array, int adjust,
                   char** rx, char** endx)
{
  *rx = *endx = NULL;

  char *longest = NULL;
  int   ln      = 0;
  int   slen    = strlen(str);
  char *end     = str + slen;            // loop invariant: end of str

  for (int i=0; i<array.Number(); i++)
   {
     int   len = strlen(array[i][COL_ESP]);
     char *ptr = NULL;

     // cannot beat the current best (this also rejects empty entries),
     // or cannot fit in str at all
     if (len<=ln || len>slen) continue;

     if (adjust==ADJ_NONE)
      {
        ptr = strstr(str, array[i][COL_ESP]);
      }
     else if (adjust==ADJ_LEFT)
      {
        if (strncmp(str, array[i][COL_ESP], len)==0) ptr = str;
      }
     else if (adjust==ADJ_RIGHT)
      {
        // Compare the tail directly.  Taking the first occurrence from
        // strstr would miss a suffix whose text also appears earlier.
        if (strcmp(end-len, array[i][COL_ESP])==0) ptr = end-len;
      }

     if (ptr==NULL) continue;

     longest = ptr;
     ln      = len;
     *rx     = ptr + len;
     *endx   = end;
   }

  // never stream a NULL char*: substitute a placeholder when nothing matched
  MESG("   -- find: str:"<<str<<" adj:"<<adjust<<" found:"
         << (longest!=NULL) <<" longest:"<<(longest ? longest : "(null)")
                            << "  rx:"<<(*rx ? *rx : "(null)")
                            <<" endx:"<<(*endx ? *endx : "(null)") );
  
  return longest;
}


// ---------------- process_morphemes ---------------------------------------
// Recursively split 'word' into morphemes and append them to 'buf'.
//
// buf    : NUL-terminated output string.  Morphemes are appended with
//          strcat and NO bounds checking; each morpheme is written with
//          at most 3 extra characters (" + "), so the caller must supply
//          a buffer that is large enough.
// word   : string to parse; can only be used as reference (a private copy
//          is cut into left / middle / right parts instead)
// t      : lexicon tables, indexed by category (CAT_*)
// cat    : category table searched at this step
// adjust : ADJ_NONE, ADJ_LEFT or ADJ_RIGHT; passed on to find_substr
//
// returns 1 if input string 'word' is completely parsed
// returns 0 if not (buf may then hold a partial result), and also when
//           the (cat, adjust) pair is not one of the cases handled below
// ----------------------------------------------------------------------
int process_morphemes(char buf[], char *word, StrTokArray t[], 
                      int cat, int adjust)
{
  MESG("  word:"<<word<<"("<<strlen(word)<<") cat:"<<cat<<"  adjust:"<<adjust);

  Str  word1(word);                                     // make a copy
  char *middle, *left, *right, *end, *str;

  left = str = word1;
  middle = find_substr(left, t[cat], adjust,  &right, &end);   // search

  Str left_s, middle_s, right_s;
  
  if (middle)                                           // prepare sub-words
   {
     // Save the right part first, then cut the copy at 'right' and at
     // 'middle' so that each piece is a NUL-terminated string of its own.
     if (right!=end) right_s = right;
     *right = 0;
     middle_s = middle;
     *middle = 0;
     if (left!=middle) left_s = left;
   }
  
  
  if (cat==CAT_ROOT && adjust==ADJ_NONE)  // --- starting point. Essential root
   {
     if (middle==NULL) return 0;
     
     if (left!=middle) 
       if (!process_morphemes(buf, left_s,t,CAT_ROOT,ADJ_RIGHT)) return 0;
     
     strcat(buf, middle_s); strcat(buf, " "); 
     
     if (right!=end) 
       if (!process_morphemes(buf, right_s,t,CAT_ROOT,ADJ_LEFT)) return 0;

     return 1;
   }

  else if (cat==CAT_ROOT && adjust==ADJ_RIGHT)     // compound word: left root
   {
     // no root at the right edge: retry the same text as a prefix
     if (middle==NULL) return process_morphemes(buf, str,t,CAT_PREF,ADJ_RIGHT);
      
     if (left!=middle)
       if (!process_morphemes(buf, left_s,t,CAT_ROOT,ADJ_RIGHT)) return 0;
     strcat(buf, middle_s); strcat(buf, " + "); 
     return 1;
   }
  
  else if (cat==CAT_PREF && adjust==ADJ_RIGHT)    // prefix at left side
   {
     if (middle==NULL) return 0;

     if (left!=middle)
       if (!process_morphemes(buf, left_s,t,CAT_PREF,ADJ_RIGHT)) return 0;
     strcat(buf, middle_s); strcat(buf, "+ "); 
     return 1;
   }
        
  else if (cat==CAT_ROOT && adjust==ADJ_LEFT)     // compound word: right root
   {
     // no root at the left edge: retry the same text as a suffix
     if (middle==NULL) return process_morphemes(buf, str,t,CAT_SUFF,ADJ_LEFT);
      
     strcat(buf, "+ "); strcat(buf, middle_s); strcat(buf, " "); 
     if (right!=end)
       if (!process_morphemes(buf, right_s,t,CAT_ROOT,ADJ_LEFT)) return 0;
     return 1;
   }
                
  else if (cat==CAT_SUFF && adjust==ADJ_LEFT)     // normal suffix at right
   {
     // no normal suffix: retry the same text as a grammatical suffix
     if (middle==NULL) return process_morphemes(buf, str,t,CAT_GSUF,ADJ_LEFT);

     strcat(buf, "+"); strcat(buf, middle_s); strcat(buf, " "); 
     if (right!=end)
       if (!process_morphemes(buf, right_s,t,CAT_SUFF,ADJ_LEFT)) return 0;
     return 1;
   }
        
  else if (cat==CAT_GSUF && adjust==ADJ_LEFT)    // grammatical suffix at right
   {
     if (middle==NULL) return 0;

     strcat(buf, "+"); strcat(buf, middle_s); strcat(buf, " "); 
     if (right!=end)
       if (!process_morphemes(buf, right_s,t,CAT_GSUF,ADJ_LEFT)) return 0;
     return 1;
   }
   
  else if (cat==CAT_CORR && adjust==ADJ_LEFT)  // --- start pt for correlatives
   {
     if (middle==NULL) return 0;
     
     strcat(buf, middle_s); strcat(buf, " "); 
          
     if (right!=end) 
       if (!process_morphemes(buf, right_s,t,CAT_GSUF,ADJ_LEFT)) return 0;

     return 1;
   }

  // Unhandled (cat, adjust) combination: report "not parsed" instead of
  // running off the end of a value-returning function.
  return 0;
}


// ------------------ process_word ----------------------------------------
// Print the lisp form of one word to standard output.
//
// t   : lexicon tables, indexed by category (CAT_*)
// str : the word; string is not modified
//
// The word is tried, in this order:
//   1. as an entry of t[CAT_SELF]  -> printed unchanged, followed by a space
//   2. as a correlative            -> process_morphemes from CAT_CORR
//   3. as a root with affixes      -> process_morphemes from CAT_ROOT
// If all three fail, a warning is written to cerr and nothing is printed
// on standard output.
// ----------------------------------------------------------------------
void process_word(StrTokArray t[], char * str)
{
  MESG("\n-- "<<str); 

  StrTok word(str);

  if (t[CAT_SELF].Find( word )!=COLLECT_INVALID_INDEX) 
   {
     cout <<str<<' ';
     return;
   }

  // process_morphemes appends without bounds checking.  Every morpheme is
  // at least one character of 'str' and is written with at most 3 extra
  // characters, so 4*strlen(str) plus the terminator always suffices.
  char *buf = new char[4*strlen(str) + 4];
  int   parsed;

  buf[0] = 0;                                   // correlative first
  parsed = process_morphemes(buf, str, t, CAT_CORR, ADJ_LEFT);

  if (!parsed)
   {
     buf[0] = 0;                                // drop any partial result
     parsed = process_morphemes(buf, str, t, CAT_ROOT, ADJ_NONE);
   }

  if (parsed) cout <<buf ; 
  else        cerr <<"Warning, the word <"<<str<<"> cannot be parsed"<<endl;

  delete [] buf;
}


// ---------------------- main -------------------------------------
// Read the lexicon from CC_VORTARO (one table per category), then read
// text from standard input character by character:
//   - runs of separators/punctuators are passed to print_punc
//   - every other run of characters is a word, passed to process_word
// Each token is also added to 'original'; that list is not read again in
// this function.  A newline is written to standard output at end of input.
//
// Words longer than LISP_BUF_SIZE characters are truncated, with a
// warning on cerr.  A last word that is ended by end-of-file rather than
// by a separator is still processed.
//
// returns 0 on success; calls exit(1) if the lexicon cannot be opened
// ----------------------------------------------------------------------
int main()
{
  int i;
  StrTokArray categs[N_CATEGS];

  fstream input(CC_VORTARO, ios::in);
  if (!input) { cerr <<"Error, can't find "<<CC_VORTARO<<endl; exit(1); }

  for (i=0; i<N_CATEGS; i++)                       // read lexicon entries
    input >> categs[i]; 
  input.close();  

  IFDEBUG 
   {
     for (i=0; i<N_CATEGS; i++)
       cout << categs[i] <<endl;
   }
  
  char buf[LISP_BUF_SIZE+3];
  char c; 
  StrTok original(NULL, ORIG_BUF_SIZE);
    
  while (1)
   {
     while (c=cin.get(), check_punc(c) && !cin.eof())   // read off puncs
        original.add(print_punc(c));
     
     if (cin.eof()) break;
     
     i=0;
     int truncated = 0;
     do                                                 // retrieve word
      {
        if (i<LISP_BUF_SIZE) buf[i++] = c;              // never overrun buf
        else                 truncated = 1;
      }
     while (c=cin.get(), !check_punc(c) && !cin.eof()); 
     
     buf[i] = 0;
     if (truncated)
       cerr <<"Warning, word longer than "<<LISP_BUF_SIZE
            <<" characters was truncated"<<endl;

     process_word(categs, buf);                         // process word
     original.add(buf);                                 // add to original list
     
     // end of input directly after the word: the word has been handled
     // above and there is no terminating character left to print
     if (cin.eof()) break;
     
     original.add(print_punc(c));
   }
  cout <<endl;
  return 0;
}


