#!/usr/bin/perl -w
#
# cooccur_scan.pl
# ---------------
# Takes input from a file with multiple motif occurrences and identifies regions of 
# co-occurrence according to the criteria specified by the 
# user.  
#
# Can also assign locations if provided with annotation tables in appropriate format.
#
# 
# Copyright (c) 2001 by Yonatan Grad and the President and Fellows of Harvard University
# 
# Created, updated, and re-written 9/2000-12/2001
#

AnalyzeParameters(@ARGV);

process_scn();

# sort the records
foreach $sort (sort {$a->{LOCATION} <=> $b->{LOCATION}} values %bylocation) {
    push (@sorted, $sort);
}

# clear the holders
for ($i=1; $i<$setmotifs; $i++){
    $hit[$i] = 0;
}

$nohit=$found=0;

# loop through the number of motifs
for ($num = 0; $num < $#sorted; $num++){

    # introduce the motif location into the @frame window
    push (@frame, $sorted[$num]);

    # increase count of motif corresponding to newest element
    for ($i = 1; $i < $setmotifs; $i++){
        if ($frame[$#frame]->{MOTIF} =~ /n$i/){
            $hit[$i]++;
            $report[$i]++;
            $lastelem = $frame[$#frame];
        }
    }

    # shift the window as appropriate
    while ($frame[$#frame]->{LOCATION}>($window+($frame[0]->{LOCATION}))){
        # drop first element of shifting frame
        $firstelem = shift(@frame); 
        # lower count of appropriate motif
 
        for ($i = 1; $i < $setmotifs; $i++){
            if ($firstelem->{MOTIF} =~ /n$i/){
                $hit[$i]--; 
            }
        }

    }

    # zero nohit parameter to check for all five motifs 
    $nohit = 0;

    # check if all desired motifs are present
    for ($x=1; $x < $setmotifs; $x++){
	if(exists ($motifs[$x])){
            # make nohit positive if at least one of
            # the motifs is absent
            if (!($hit[$x])){
                $nohit++;
                    if ($found) {
                        $difference = $frame[$#frame-1]->{LOCATION}-$startpt;
                        printf "Motif density is %d / %d\n", $densitycount, 
                               ($frame[$#frame-1]->{LOCATION} - $startpt);
                        $" = "|";
                        print "$startpt|$difference|$densitycount|";

			if ($annot){
 			  give_location($startpt);
			  print O "$startpt|$difference|$densitycount|";
			}

                        # decrement the last entry into the report array
                        # so it does not include the next motif

                        for ($l = 1; $l < $setmotifs; $l++){
                            if ($lastelem->{MOTIF} =~ /n$l/){
                                $report[$l]--;
                            }           
                            print "$report[$l]|";
			    print O "$report[$l]|" if ($annot);
			    
                        }
			print "\n";
			print O "\n" if ($annot);
			
                    }
                $found = 0;
                @report = ();
            }
	}
    }

    # if nohit is 0 and found is TRUE then print results for only the current motif
    # being checked (it will appear as an addition to the other results)
    if ((!($nohit)) && ($found)){
        print 
            $frame[$#frame]->{MOTIF}."\t".
            $frame[$#frame]->{LOCATION}."\t".
            $frame[$#frame]->{SEQ}."\t".
            $frame[$#frame]->{ORIENT}."\n";
        $densitycount++;
    }

    # if nohit is 0, ie if all motifs present,
    # then print results and set found variable to
    # true
    if ((!($nohit)) && (!($found))){
        $found = 1;
        print "\nputative enhancer starting at $frame[0]->{LOCATION}\n";
        print
            $_->{MOTIF}."\t".
            $_->{LOCATION}."\t".
            $_->{SEQ}."\t".
            $_->{ORIENT}."\n" foreach @frame;
        $densitycount = $#frame + 1;
        @report = @hit;
        $startpt = $frame[0]->{LOCATION};
    }

}

sub give_location {
    if ($_[0] >= $srt[$#srt]->{END}){
        $entryA = $srt[$#srt];
        if ($entryA->{STRAND} eq "+"){
            printf O "found %d bp downstream of $entryA->{NAME}|", 
                ($_[0] - $entryA->{END});
            print O "$entryA->{NAME}|0|";
        } else {
            printf O "found %d bp upstream of $entryA->{NAME}|", 
                ($_[0] - $entryA->{END});
            print O "$entryA->{NAME}|$entryA->{ACCN}|0|0|";
        }
    } elsif ((
        ($srt[0]->{STRAND} eq "+") && ($_[0] <= $srt[0]->{START}))
        || (($srt[0]->{STRAND} eq "-") && ($_[0] <= $srt[0]->{END}))){
        $entryA = $srt[0];
        if ($entryA->{STRAND} eq "+"){
            printf O "found %d bp upstream of $entryA->{NAME} exon $entryA->{EXON}|", 
                ($entryA->{START}-$_[0]);
            print O "$entryA->{NAME}|$entryA->{ACCN}|0|0|";
        } else {
            printf O "found %d bp downstream of $entryA->{NAME} exon $entryA->{EXON}|", 
                ($entryA->{END}-$_[0]);
            print O "$entryA->{NAME}|$entryA->{ACCN}|0|0|";
        }
            
    } else {    


        for ($x = 0; $x < $#srt; $x++){
            $entryA = $srt[$x];
            $entryZ = $srt[$x-1];
            last if ($entryA->{START} > $_[0]) || ($entryA->{END} > $_[0]);
        }

        if (($entryA->{END} >= $_[0])&&($entryA->{START} <= $_[0])){
            print O "found within $entryA->{NAME} exon $entryA->{EXON}|";
            print O "$entryA->{NAME}|$entryA->{ACCN}|0|0|";
        }

        if ($entryA->{START} > $_[0]){
            printf O "found %d bp",
                ($_[0] - $entryZ->{END});
            if ($entryZ->{STRAND} eq "+"){
                print O " downstream ";
            } else {
                print O " upstream ";
            }
            print O "of $entryZ->{NAME} exon $entryZ->{EXON} ";

            printf O "and %d bp",
                ($entryA->{START} - $_[0]);
            if ($entryA->{STRAND} eq "+"){
                print O " upstream ";
            } else {
                print O " downstream ";
            }
            print O "of $entryA->{NAME} exon $entryA->{EXON}|";

            print O "$entryZ->{NAME}|$entryZ->{ACCN}|$entryA->{NAME}|$entryA->{ACCN}|";
        }
    }    
}

sub process_scn {
    my @tab;
    @tab = ();

    while ($line = <FILE>){ 
        chomp($line);
        @tab = split(/\s+/, $line); 
	

        if (scalar(@tab) < 6){
	  print "\nError:\nInput file is not in the proper format\n";
	  input_format();
	  exit;
	} elsif ($tab[1]!~/\d+/){
	  print "\nError:\nIn input file: 2nd field must be numeric (location of a motif)\n";
	  input_format();
	  exit;
	} elsif ($tab[3]!~/[ACGTNacgtn]/){
	  print "\nError:\nIn input file: 4th field must contain only A, C, T, G, or N (motif sequence)\n";
	  input_format();
	  exit;
	} elsif ($tab[2]!~/1/&&$tab[2]!~/0/){
	  print "\nError:\nIn input file: 3rd field must be either a 0 or 1 (strand of motif; 1 is plus strand, 0 is minus)\n";
	  input_format();
	  exit;
	} elsif ($tab[5]!~/n\d/){
	  print "\nError:\nIn input file: 6th field must be a motif designation in the form n# (e.g. n1)\n";
	  input_format();
	  exit;
	}
        $record = {
            LOCATION => $tab[1],
            SEQ      => $tab[2],
            ORIENT   => $tab[3],
            MOTIF    => $tab[5]
        };
        $bylocation { $record->{LOCATION} } = $record;
    }
} 

sub process_annot {
  my @tab;

  while ($line = <ANNOT>){
    chomp $line;
    @tab = split(/\s+/, $line);

       
    if ($line=~/\>/){
      next;
    } elsif (scalar(@tab) < 9) {
      print "\nError:\nAnnotation file is not in the proper format\n";
      Format();
      exit;
    }
   
    if ($tab[2] eq "" || $tab[2]!~/^\d+$/){
      print "\nError:\nIn annotation file: 3rd field must be numeric (coding start site)\n";
      Format();
      exit;
    }
    
    if ($tab[3] eq "" || $tab[3]!~/^\d+$/){
      print "\nError:\nIn annotation file: 4th field must be numeric (coding end site)\n";
      Format();
      exit;
    }

    if ($tab[4] eq "" || $tab[4]!~/[\+\-]/){
      print "\nError:\nIn annotation file: 5th field must be either + or - (strand of coding seq)\n";
      Format();
      exit;
    }

    if ($tab[5] eq ""){
      print "\nError:\nIn annotation file: 6th field must specify the gene name\n";
      Format();
      exit;
    }

    if ($tab[8] eq "" || $tab[8]!~/^\d+$/){
      print "\nError:\nIn annotation file: 9th field must be numeric (coding exon number)\n";
      Format();
      exit;
    }

    if ($tab[6] eq ""){
      print "\nError:\nIn annotation file: 7th field must specify the accession name\n";
      Format();
      exit;
    }

    $recordA = {
      START  => $tab[2],
      END    => $tab[3],
      STRAND => $tab[4],
      NAME   => $tab[5],
      ACCN   => $tab[6],
      EXON   => $tab[8]
    };
    $bystart {$recordA->{START}} = $recordA;
  }
  foreach $sortstart (sort {$a->{START} <=> $b->{START}} values %bystart){
    push(@srt, $sortstart);
  }

  close ANNOT;

  open (O, ">$infile.$window.list") or die "Can't open annotation-based output file: $!\n";

}

sub AnalyzeParameters {

  my $i;
  $infile = "";
  $window = "";
  $annot = "";
  $check = "";
  $setmotifs = "";  

  if (@_ == 0) {
    Usage();
    exit;
  }

  for ($i=0;$i<@_;++$i) {
    if ($_[$i] eq "-i") {
      $infile = $_[$i+1];
      ++$i;
    }
    elsif ($_[$i] eq "-w") {
      $window = $_[$i+1];
      $i++;
    }
    elsif ($_[$i] eq "-m") {
      $setmotifs = $_[$i+1];
      $i++;
    }
    elsif ($_[$i] eq "-c") {
      $check = $_[$i+1];
      $i++;
    }
    elsif ($_[$i] eq "-a") {
      $annot = $_[$i+1];
      $i++;
    }
  }

  if ($infile eq "") {
    die "Missing required parameter -i (input file).\n";
  }
  if (! -e $infile) {
    die "Input file $infile (-i) does not exist.\n";
  }
  open (FILE, "$infile") or die "Can't open $infile: $!\n";
  
  if ($window eq "") {
    die "Must specify search window (-w).\n";
  }
  if ($setmotifs eq "") {
    die "Must specify number of motifs (-m).\n";
  }
  if ($check eq ""){
    die "Must specify which motifs to check (-c).\n";
  }
  if ($annot eq ""){
    print "No annotations specified... Continuing with analysis.\n";
  } else {
    print "Uploading from annotation file $annot.\n";
    open (ANNOT, "$annot") or die "Can't open $annot: $!\n";
    process_annot();
    print "Done uploading annotations.\n";
  }

  if ($window =~/^\d+$/){} else {
    die "Search window $window (-w) must be numeric.\n";
  }
  if ($setmotifs =~/^\d+$/){
    $setmotifs++;
  } else {
    die "Number of motifs $setmotifs (-m) must be numeric.\n";
  }
  if ($check=~/^[\d\,]+$/){
    @ch = split (/\,/, $check);
    $chnum = scalar (@ch);
    if (($chnum+1)>$setmotifs){
      die "Number of motifs to check (-c) MUST be less than or equal to total number of motifs (-m).\n";
    } else {
      print "$chnum motifs designated: ";
      print "$_ " foreach (@ch);
      for ($j=0;$j<$setmotifs;$j++){
        foreach (@ch){
          if ($_==$j){
	    $motifs[$_] = 1;
	  }
	}
      }
      print "\n";
    }
  } else {
    die "Motifs to check must be designated as follows: <motif num>,<motif num>, etc.\n";
  }
}

sub Usage {
  print <<_HELP

cooccur_scan.pl

Usage:
perl cooccur_scan.pl -i -m -w -c -a
  -i: input file (required)
  -m: number of motifs to be searched for (required)
  -w: search window in which to look for co-occurrence of all motifs (required)
  -c: motifs to be checked, using the format: <motif num>,<motif num>,<etc.>
  -a: annotation file (optional)

_HELP
  ;
}

sub Format {
  print <<_FORM

Annotation files must contain one line per annotation, and each line must have at least 
9 WHITESPACE delimited fields, with the following fields set as indicated:

  Field 3: start bp of coding sequence (must be numeric and less than entry in Field 4)
  Field 4: end bp of coding sequence (must be numeric and greater than entry in Field 3)
  Field 5: strand (either a + or -)
  Field 6: gene name
  Field 7: accession number (e.g., Flybase ID)
  Field 9: exon number (must be numeric)

NOTE: no header lines are allowed in the annotation files.

_FORM
  ;
}

sub input_format {
  print <<_INFORMAT

Input files contain the results of a DNA sequence scan for specific motifs.  These files
will be used to search for co-occurrences of motifs according to the parameters specified
on the command line.  The input file must be a concatenation of all the motif occurrences
for all the motifs to be used in the search, with one line per single motif occurrence.
The following fields must be set as indicated:

  Field 2: location of motif
  Field 3: motif sequence (only A's, C's, G's, T's, and N's allowed)
  Field 4: strand of motif (0 indicates minus strand, 1 indicates plus strand)
  Field 6: motif designation (must be in the format n#, e.g. n1)

Example:

test_sequence_1		2914	1	ACTGACTGAC	15.9638	n1
test_sequence_1		5500	1	ACTGACGTAC	15.9638 n1
test_sequence_1		8364	0	ACTGACTGAC	15.9638 n1
test_sequence_1		465	1	GGAGCAG		8.0235	n2
test_sequence_1		7465	1	GCAGCAG		8.0235	n2
test_sequence_1		931	0	TACCAGGA	9.4333	n3
test_sequence_1		6931	0	TACCTGGA	9.4333	n3
test_sequence_1		10931	0	AACCAGGG	8.7445	n3

_INFORMAT
  ;
}