#!/usr/bin/env perl
use warnings;
use strict;
use utf8;
BEGIN {
	$| = 1;
	binmode(STDIN, ':encoding(UTF-8)');
	binmode(STDOUT, ':encoding(UTF-8)');
}
use open qw( :encoding(UTF-8) :std );

use Getopt::Long;
Getopt::Long::Configure('no_ignore_case');

my %opts = ();
GetOptions (\%opts,
   'help|?',
   'grammar|g=s',
   'output|o',
   'strip',
   'secondary',
   'regex',
   'icase',
   'baseforms',
   'wordforms',
   'all',
   );

sub print_help {
   print <<'XOUT';
Usage: cg-strictify [OPTIONS] <grammar>

Options:
 -?, --help       outputs this help
 -g, --grammar    the grammar to parse; defaults to first non-option argument
 -o, --output     outputs the whole grammar with STRICT-TAGS
     --strip      removes superfluous LISTs from the output grammar; implies -o
     --secondary  adds secondary tags (<...>) to strict list
     --regex      adds regular expression tags (/../r, <..>r, etc) to strict list
     --icase      adds case-insensitive tags to strict list
     --baseforms  adds baseform tags ("...") to strict list
     --wordforms  adds wordform tags ("<...>") to strict list
     --all        same as --strip --secondary --regex --icase --baseforms --wordforms

XOUT
}

if (defined $opts{'help'}) {
   print_help();
   exit(0);
}
if (!defined $opts{'grammar'}) {
   if (!$ARGV[0]) {
      print "Missing input grammar argument!\n\n";
      print_help();
      exit(-1);
   }
   $opts{'grammar'} = $ARGV[0];
}
if (! -s $opts{'grammar'}) {
   print "Grammar is either missing or empty!\n";
   exit(-1);
}

if (defined $opts{'strip'}) {
   $opts{'output'} = 1;
}

if (defined $opts{'all'}) {
   $opts{'secondary'} = 1;
   $opts{'regex'} = 1;
   $opts{'icase'} = 1;
   $opts{'baseforms'} = 1;
   $opts{'wordforms'} = 1;
}

my $tags = `vislcg3 --show-tags -g '$opts{grammar}' | LC_ALL=C sort`;
$tags =~ s@[\r\n]+@\n@g; # Normalize newlines
chomp($tags);
my @tags = split /\n/, $tags;

my $options = '';
if (defined $opts{'secondary'} || defined $opts{'regex'} || defined $opts{'icase'} || defined $opts{'baseforms'} || defined $opts{'wordforms'}) {
   $options .= 'OPTIONS +=';
   if (defined $opts{'secondary'}) {
      $options .= ' strict-secondary';
   }
   if (defined $opts{'regex'}) {
      $options .= ' strict-regex';
   }
   if (defined $opts{'icase'}) {
      $options .= ' strict-icase';
   }
   if (defined $opts{'baseforms'}) {
      $options .= ' strict-baseforms';
   }
   if (defined $opts{'wordforms'}) {
      $options .= ' strict-wordforms';
   }
   $options .= " ;\n";
}

my $strict_tags = 'STRICT-TAGS +=';
my @strict = ();
my $lf = '';
foreach my $tag (@tags) {
   if ($tag eq '*') {
      next; # Marker for erasure, any, etc.
   }
   if ($tag eq '>>>') {
      next; # Start of window marker
   }
   if ($tag eq '<<<') {
      next; # End of window marker
   }
   if ($tag =~ m@^\^@) {
      next; # Fail-fast tags
   }
   if ($tag =~ m@^VSTR:@) {
      next; # Varstrings
   }
   if ($tag =~ m@^VAR:@) {
      next; # Global variables
   }
   if ($tag =~ m@^_.+_$@) {
      next; # Magic placeholders such as _TARGET_
   }
   if ($tag =~ m@dummy string@) {
      next; # Internal placeholder for tag #0
   }

   if ($tag =~ m@[>"/]v$@) {
      next;
   }
   if (! defined $opts{'secondary'} && $tag =~ m@^<.+>[riv]*$@) {
      next;
   }
   if (! defined $opts{'regex'} && $tag =~ m@^/.+/[riv]*$@) {
      next;
   }
   if (! defined $opts{'icase'} && $tag =~ m@[>"/]i$@) {
      next;
   }
   if (! defined $opts{'baseforms'} && $tag =~ m@^"[^<].*"[riv]*$@) {
      next;
   }
   if (! defined $opts{'wordforms'} && $tag =~ m@^"<.*>"[riv]*$@) {
      next;
   }

   my $nlf = substr($tag, 0, 1);
   if ($nlf =~ m@^\p{Lu}@) {
      $nlf = 'A';
   }
   elsif ($nlf =~ m@^\p{Ll}@) {
      $nlf = 'a';
   }
   elsif ($tag =~ m@^"<.+>"[riv]*$@) {
      $nlf = 'W';
   }
   elsif ($tag =~ m@^".+"[riv]*$@) {
      $nlf = 'B';
   }
   if ($lf && $lf ne $nlf) {
      $strict_tags .= "\n";
   }
   $lf = $nlf;

   $strict_tags .= " $tag";
   push(@strict, $tag);
}
$strict_tags .= " ;\n";

if (! defined $opts{'output'}) {
   print "Put these lines at the top of your grammar, and edit the STRICT-TAGS list by removing invalid tags:\n\n";
   print $options;
   print $strict_tags;
   exit(0);
}

my $g = '';
{
   local $/ = undef;
   open(FH, '<'.$opts{'grammar'});
   $g = <FH>;
   close(FH);
}

if (defined $opts{'strip'}) {
   print STDERR "Deleting unnecessary LISTs...\n";
   foreach my $tag (@strict) {
      if ($g =~ m@[\b\n][Ll][Ii][Ss][Tt]\s+\Q$tag\E\s*=\s*\Q$tag\E\s*;@) {
         print STDERR "Deleting LIST $tag\n";
         $g =~ s@[\b\n][Ll][Ii][Ss][Tt]\s+\Q$tag\E\s*=\s*\Q$tag\E\s*;@@;
      }
   }
}

print $options;
print $strict_tags;
print "\n";
print $g;
