#! /usr/bin/perl -w
#
# This script replaces bases from nucleotides in a .pdb with three
# pseudo atoms named PL1, PL2, PL3 that determine the base plane 
# information
#
# usage:
#  ./replace_base_by_plane.pl A_with_base.pdb > A_with_plane.pdb
#
# A_with_base.pdb  : pdb containing 'full' nucleotides
# A_with_plane.pdb : pdb containing 'nucleotide base plane only'
#                    plus first group nitrogen ( GUA N9, ADE N9, 
#                                                THY N1, CYT N1 )
#
# Copyright (c) 2007 Peter Minary

  $pdb_file = $ARGV[0];
  open(PDB,"$pdb_file");

# read in .pdb

  $natom = 0;
  while (<PDB>) {
      chomp;
      $entry = $_;
      if (substr($entry,0,4) eq 'ATOM') {
         @list_entry = split;
         $atom = $list_entry[2];
         $altLoc = substr($entry,16,1);
         if ($altLoc eq ' ' || $altLoc eq 'A') {
             $natom++;
             $atom[$natom]       = substr($entry,12,4); $atom[$natom]    =~ tr/ //d;
             $resName[$natom]    = substr($entry,17,3); # $resName[$natom] =~ tr/ //d;
        if ($resName[$natom] eq "ADE") {$resName[$natom] = " A ";}
        if ($resName[$natom] eq "THY") {$resName[$natom] = " T ";}
        if ($resName[$natom] eq "GUA") {$resName[$natom] = " G ";}
        if ($resName[$natom] eq "CYT") {$resName[$natom] = " C ";}
             substr($entry,17,3) = $resName[$natom];
             $resName[$natom] =~ tr/ //d;
             $chainID[$natom]    = substr($entry,21,1); $chainID[$natom] =~ tr/ //d;
             $resSeq[$natom]     = substr($entry,22,4); $resSeq[$natom]  =~ tr/ //d;
# NOUSE
#             $char_first[$natom] = substr($atom,0,1);
# NOUSE
             $x[$natom]     = substr($entry,30,8); $x[$natom] =~ tr/ //d;
             $y[$natom]     = substr($entry,38,8); $y[$natom] =~ tr/ //d;
             $z[$natom]     = substr($entry,46,8); $z[$natom] =~ tr/ //d;
             $entry[$natom] = $entry;
         }
#DEBUG
# print "$entry\n";
#DEBUG
      }
#NOUSE
#      if (substr($entry,0,3) eq 'END' || substr($entry,0,3) eq 'TER') {
#          last;
#      }
#NOUSE
  }

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ separate atoms into residues ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  $nres   = 1;
  $resSeq = $resSeq[1];
  $natom_res[$nres] = 0;
  for ($iatom=1;$iatom<=$natom;$iatom++) {
       if ($resSeq[$iatom] ne $resSeq) {
           $resSeq = $resSeq[$iatom];
           $nres++;
           $natom_res[$nres]++;
           $ind_atom[$nres][$natom_res[$nres]] = $iatom;
       } else {
           $natom_res[$nres]++;
           $ind_atom[$nres][$natom_res[$nres]] = $iatom;  
       }
  }

#~~~~~~~~~~~~~~~~~~~~~~~~~ separate residues into molecules based on chainID ~~~~~~~~~~~~~~~~~~

  $nmol = 1;
  $chainID = $chainID[1];
  $nres_mol[$nmol] = 0;
  for ($ires=1;$ires<=$nres;$ires++) {
       $atom1 = $ind_atom[$ires][1];
#DEBUG
#  print "$chainID $chainID[$atom1]\n";
#  print "$entry[$atom1]\n";
#DEBUG
       if ($chainID[$atom1] ne $chainID) {
           $chainID = $chainID[$atom1];
           $nmol++;
           $nres_mol[$nmol]++;
           $ind_res[$nmol][$nres_mol[$nmol]] = $ires;
       } else {
           $nres_mol[$nmol]++;
           $ind_res[$nmol][$nres_mol[$nmol]] = $ires;
       }
  }

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~ identify molecules as DNA or protein  ~~~~~~~~~~~~~~~~~~~~~~~~~~

   $nDNA = 0;
   for ($imol=1;$imol<=$nmol;$imol++) {
        $ires    = $ind_res[$imol][1];
        $iatm    = $ind_atom[$ires][1];
        $resName = $resName[$iatm];
        $length  = length($resName);
        if ($length == 3) { $ityp_mol[$imol] = 'protein'; }
        if ($length  < 3) { $ityp_mol[$imol] = 'DNA'; }
#DEBUG
#        print "$ityp_mol[$imol] \n";
#DEBUG
        if ($ityp_mol[$imol] eq 'DNA') {
            $nDNA++;
            $ind_DNA[$nDNA] = $imol;
        }
        if ($ityp_mol[$imol] eq 'protein') {
#            $ind_prot       = $imol;
        }
   }
   if ($nDNA != 2)
      { die "The number of DNA chains is not 2";}
   if ($nres_mol[$ind_DNA[1]] != $nres_mol[$ind_DNA[2]])
      { die "DNAs do not have the same # of residues\n";}

#DEBUG
# print "ind_DNA[1] $ind_DNA[1]\n";
#DEBUG

#~~~~~~~~~~~ find the indices of base/sidechain and backbone atoms   ~~~~~~~~~~~~~~~~~~
# $ind_atom_back[$ires][$iatm]  :  backbone atoms for $ires in protein/DNA
# $natom_back[$ires]            :  # of backbone atoms for $ires in protein/DNA
# $ind_atom_side[$ires][$iatm]  :  sidechain/base atoms for $ires in protein/DNA
# $natom_side[$ires]            :  # of sidechain/base atoms for $ires in protein/DNA
  
    @prot_back = ("C", "O", "CA", "HA", "CB", "N", "HN", "HA1", "HA2");
    @nucl_back = ("C1'", "C2'", "C3'", "C4'", "O4'", "H1'", "H2'", "H2''",
                  "H3'", "O3'", "H4'", "C5'", "O5'", "H5'", "H5''", "P",
                  "O1P", "O2P", "O2'" );
    @base_plane = ("C2", "C6", "C4" );

    @n1_n9   = ("N1", "N9");

#DEBUG
#    $nucl_back = @nucl_back;
#    print "$nucl_back\n";
#    print "$nucl_back[3]\n"; <STDIN>;
#DEBUG
    
     $prot_back  = @prot_back;
     $nucl_back  = @nucl_back;
     $base_plane = @base_plane;
     $n1_n9      = @n1_n9;

     for ($ires=1;$ires<=$nres;$ires++) {
 	  $natom_back[$ires] = 0;
          $natom_side[$ires] = 0;
          $natom_plane[$ires] = 0;
          $natom_n1_n9[$ires] = 0;

          for ($iatm=1;$iatm<=$natom_res[$ires];$iatm++) {
               $iatom = $ind_atom[$ires][$iatm];
               $backbone = 0;
               $base_plane_atom = 0;
               $n1_n9_found  = 0;
	       for ($katm=0;$katm<$prot_back;$katm++) {
		   if ($atom[$iatom] eq $prot_back[$katm]) {$backbone = 1;}
               }
	       for ($katm=0;$katm<$nucl_back;$katm++) {
		   if ($atom[$iatom] eq $nucl_back[$katm]) {$backbone = 1;}
               }
	       for ($katm=0;$katm<$base_plane;$katm++) {
		   if ($atom[$iatom] eq $base_plane[$katm]) 
                                                    {$base_plane_atom = 1;}
               }
               for ($katm=0;$katm<$n1_n9;$katm++) {
		   if ($atom[$iatom] eq $n1_n9[$katm]) {$n1_n9_found = 1;}
               }
 
               if ($backbone==1) {
                   $natom_back[$ires]++;
                   $ind_atom_back[$ires][$natom_back[$ires]] = $iatom;
               }
               if ($backbone==0) {
                   $natom_side[$ires]++;
                   $ind_atom_side[$ires][$natom_side[$ires]] = $iatom;
               }
               if ($base_plane_atom == 1) {
                   $natom_plane[$ires]++;
                   $ind_atom_plane[$ires][$natom_plane[$ires]] = $iatom;
               }
               if ($n1_n9_found==1) {
                   $natom_n1_n9[$ires]++;
                   $ind_atom_n1_n9[$ires][$natom_n1_n9[$ires]] = $iatom;
               }
               

#DEBUG
#	       print "$entry[$iatom]\n";
#DEBUG
          }
#DEBUG
#          print "ires $ires\n";
          for ($iatm=1;$iatm<=$natom_side[$ires];$iatm++) {
	       $iatom = $ind_atom_side[$ires][$iatm];
#               print "$entry[$iatom]\n"
          }
#          <STDIN>;
#DEBUG
     }

#~~~~~~~~~~~ print any molecules but for nucleotides only the backbone ~~~~~~~~~~~~~~~~~~

     for ($imol=1;$imol<=$nmol;$imol++) {
        if ($ityp_mol[$imol] eq 'DNA') {          
#DEBUG
#    print ("DNA\n");
#    <STDIN>;
#DEBUG
            $iires = $ind_res[$imol][1];
            $iatom   = $ind_atom_back[$iires][1];
            $chainID = $chainID[$iatom];
#            print "SEQ $chainID >"; 
            for ($ires=1;$ires<=$nres_mol[$imol];$ires++) {
                 $iires = $ind_res[$imol][$ires];
                 $iatom   = $ind_atom_back[$iires][1];
                 $resName = $resName[$iatom];
#                 print "$resName"; 
            }#endfor#       
#            print "\n";
	    for ($ires=1;$ires<=$nres_mol[$imol];$ires++) {
                 $iires = $ind_res[$imol][$ires];
                 for ($iatm=1;$iatm<=$natom_back[$iires];$iatm++) {
                      $iatom = $ind_atom_back[$iires][$iatm];
                      print "$entry[$iatom]\n";
	         }#endfor#
		 for ($iatm=1;$iatm<=$natom_plane[$iires];$iatm++) {
		      $iatom = $ind_atom_plane[$iires][$iatm];
                      $entry_base = $entry[$iatom];
                      $atom_base       = "PLN";
                      $atom_base_field = sprintf("%-4s",$atom_base);
                      substr($entry_base,12,4) = $atom_base_field;
                      print "$entry_base\n";
                 }#endfor#
		 for ($iatm=1;$iatm<=$natom_n1_n9[$iires];$iatm++) {
		      $iatom = $ind_atom_n1_n9[$iires][$iatm];
                      if ( (substr($resName[$iatom],0,1) eq 'A') && 
                           ($atom[$iatom] eq 'N1' ) )  {next;}
                      if ( (substr($resName[$iatom],0,1) eq 'G') && 
                           ($atom[$iatom] eq 'N1' ) )  {next;}
                      $entry_base = $entry[$iatom];
                      $atom_base       = "NN";
                      $atom_base_field = sprintf("%-4s",$atom_base);
                      substr($entry_base,12,4) = $atom_base_field;
                      print "$entry_base\n";
                 }#endfor#
#DEBUG
#                 <STDIN>;
#DEBUG
            }#endfor#
        } else {
	    for ($ires=1;$ires<=$nres_mol[$imol];$ires++) {
                 $iires = $ind_res[$imol][$ires];
                 for ($iatm=1;$iatm<=$natom_res[$iires];$iatm++) {
                      $iatom = $ind_atom[$iires][$iatm];
#                      print "$entry[$iatom]\n";
	         }#endfor#
#DEBUG
#                 <STDIN>;
#DEBUG
            }#endfor#
	}#endif#
     }#endfor#
