#!/usr/bin/perl

use strict;
use warnings;

use Chemistry::OpenSMILES::Stereo qw( mark_all_double_bonds );
use Chemistry::OpenSMILES::Writer qw( write_SMILES );
use File::Basename qw( basename );
use Getopt::Long::Descriptive;
use Graph::Traversal::DFS;
use Graph::Undirected;
use List::Util qw( first );
use XML::LibXML;
use XML::LibXML::XPathContext;

my $basename = basename $0;
my( $opt, $usage ) = describe_options( <<"END" . 'OPTIONS',
USAGE
    $basename [<args>] [<files>]

DESCRIPTION
    $basename converts CML files into SMILES.

END
    [ 'enable-network-access',
      'allow XML parser to access network to fetch remote content' ],
    [],
    [ 'help', 'print usage message and exit', { shortcircuit => 1 } ],
);

if( $opt->help ) {
    print $usage->text;
    exit;
}

@ARGV = ( '-' ) unless @ARGV;

for my $filename (@ARGV) {
    my $cml = XML::LibXML->load_xml( location => $filename,
                                     no_network => !$opt->enable_network_access );

    my $xp = XML::LibXML::XPathContext->new( $cml );
    $xp->registerNs( 'cml', 'http://www.xml-cml.org/schema' );

    my @graphs;
    for my $molecule ( $xp->findnodes( '/cml:cml/cml:molecule' ) ) {
        my $graph = Graph::Undirected->new( refvertexed => 1 );

        my( $atomArray ) = $molecule->getChildrenByTagName( 'atomArray' );
        next unless $atomArray; # Skip empty molecules

        my %vertex_by_atom_id;
        my %atom_id_by_vertex;
        my $n = 0;
        for my $atom ($atomArray->getChildrenByTagName( 'atom' )) {
            my $vertex = { number => $n++ };

            if( $atom->hasAttribute( 'elementType' ) ) {
                $vertex->{symbol} = $atom->getAttribute( 'elementType' );
            }
            if( $atom->hasAttribute( 'formalCharge' ) ) {
                $vertex->{charge} = int $atom->getAttribute( 'formalCharge' );
            }

            # Hydrogen count will be handled after bonds are added
            if( $atom->hasAttribute( 'hydrogenCount' ) ) {
                $vertex->{hydrogen_count} = $atom->getAttribute( 'hydrogenCount' );
            }

            my( $atomParity ) = $atom->getChildrenByTagName( 'atomParity' );
            if( $atomParity &&
                $atomParity->hasAttribute( 'atomRefs4' ) &&
                $atomParity->textContent =~ /^-?1$/ ) {
                my $chirality = int $atomParity->textContent;
                $vertex->{chirality} = $chirality == -1 ? '@' : '@@';

                # IDs will be replaced with real vertices in the second
                # pass, as some of vertices might not be created yet.
                $vertex->{chirality_neighbours} =
                    [ split ' ', $atomParity->getAttribute( 'atomRefs4' ) ];
            }

            $vertex_by_atom_id{$atom->getAttribute( 'id' )} = $vertex;
            $atom_id_by_vertex{$vertex} = $atom->getAttribute( 'id' );

            $graph->add_vertex( $vertex );
        }

        my @bonds;
        my( $bondArray ) = $molecule->getChildrenByTagName( 'bondArray' );
        if( $bondArray ) {
            @bonds = $bondArray->getChildrenByTagName( 'bond' );
        }

        # Pass through bonds to establish connectivity in the graph.
        for my $bond (@bonds) {
            my @vertices = map { $vertex_by_atom_id{$_} }
                               split ' ', $bond->getAttribute( 'atomRefs2' );
            $graph->add_edge( @vertices );

            my  $bond_type = bond_type( $bond->getAttribute( 'order' ) );
            if( $bond_type && $bond_type ne '-' ) {
                $graph->set_edge_attribute( @vertices, 'bond', $bond_type );
            }
        }

        # Pass through bonds to collect cis/trans bond settings.
        # TODO: Some CMLs have coordinates, thus additional cis/trans settings could be derived from them as well.
        my @cis_trans_settings;
        for my $bond (@bonds) {
            my( $stereo ) = $bond->getChildrenByTagName( 'bondStereo' );
            next unless $stereo;

            warn "wedge/hatch stereochemistry detected, not supported yet\n" if $stereo->hasAttribute( 'atomRefs2' );

            next unless $stereo->hasAttribute( 'atomRefs4' ) &&
                        $stereo->textContent =~ /^[CT]$/;

            # As per https://www.xml-cml.org/convention/molecular#bondStereo-atomRefs4, atoms here can be given in any order.
            my @atoms = map { $vertex_by_atom_id{$_} }
                            split ' ', $stereo->getAttribute( 'atomRefs4' );
            my $subgraph = $graph->subgraph( \@atoms );
            my $first = first { $subgraph->degree( $_ ) == 1 }
                        sort  { $a->{number} <=> $b->{number} } @atoms;
            @atoms = reverse Graph::Traversal::DFS->new( $subgraph, first_root => sub { $first } )->dfs;
            push @cis_trans_settings, [ @atoms, $stereo->textContent eq 'C' ? 'cis' : 'trans' ];
        }

        mark_all_double_bonds( $graph, \@cis_trans_settings );

        # A graph might describe more than one moiety. If so, it should be
        # split into components as Chemistry::OpenSMILES does not understand
        # not connected graphs.
        my @components;
        my $next_root =
            sub {
                my( $self, $unseen ) = @_;
                my $atom = next_vertex( $self, $unseen );
                return unless $atom;

                push @components, Graph::Undirected->new( refvertexed => 1 );
                # Adding the first atom here as this is the only way for
                # single-atom moieties to be represented.
                $components[-1]->add_vertex( $atom );
                return $atom;
            };
        my $register_edge =
            sub {
                my( $u, $v, $self ) = @_;
                my $graph = $self->graph;
                $components[-1]->add_edge( $u, $v );
                if( $graph->has_edge_attribute( $u, $v, 'bond' ) ) {
                    $components[-1]->set_edge_attribute( $u, $v, 'bond',
                                                         $graph->get_edge_attribute( $u, $v, 'bond' ) );
                }
            };
        my $dfs = Graph::Traversal::DFS->new(
                    $graph,

                    first_root     => $next_root,
                    next_root      => $next_root,
                    next_successor => \&next_vertex,

                    tree_edge      => $register_edge,
                    non_tree_edge  => $register_edge,
                  );
        $dfs->dfs;

        # Second pass through the atoms to set hydrogen counts and
        # chirality neighbours.
        # TODO: Some CMLs have coordinates, thus additional chirality
        # settings could be derived from them as well.
        for my $component (@components) {
            for my $atom ($component->vertices) {
                if( exists $atom->{hydrogen_count} ) {
                    my $hydrogen_count = grep { $_->{symbol} eq 'H' }
                                              $component->neighbours( $atom );
                    if( $hydrogen_count > $atom->{hydrogen_count} ) {
                        warn 'total number of attached hydrogen atoms is ' .
                             "less than the number of explicit hydrogen atoms\n";
                    }
                    while( $hydrogen_count < $atom->{hydrogen_count} ) {
                        $component->add_edge( $atom, { symbol => 'H',
                                                       number => $n++ } );
                        $hydrogen_count++;
                    }
                    delete $atom->{hydrogen_count};
                }

                if( exists $atom->{chirality_neighbours} ) {
                    $atom->{chirality_neighbours} =
                        [ map { $vertex_by_atom_id{$_} }
                              @{$atom->{chirality_neighbours}} ];

                    for (@{$atom->{chirality_neighbours}}) {
                        next if $component->has_edge( $atom, $_ );
                        warn "no edge between atoms '" .
                             $atom_id_by_vertex{$atom} . "' and '" .
                             $atom_id_by_vertex{$_} . "'\n";
                    }
                }
            }
        }

        push @graphs, @components;
    }

    print join( '.', map { write_SMILES( $_, { raw => 1 } ) } @graphs ), "\n";
}

sub bond_type
{
    my( $order ) = @_;

    return '-' if $order eq 'S' || $order eq '1';
    return '=' if $order eq 'D' || $order eq '2';
    return ':' if $order eq 'A';
    return '#' if $order eq 'T' || $order eq '3';

    warn "unknown bond type: '$order'\n";

    return;
}

sub next_vertex
{
    my( $self, $unseen ) = @_;
    return unless %$unseen;

    my( $next ) = map  { $unseen->{$_} }
                  sort { $unseen->{$a}{number} <=> $unseen->{$b}{number} }
                       keys %$unseen;
    return $next;
}
