#!/usr/bin/perl -w # # cooccur_scan.pl # --------------- # Takes input from a file with multiple motif occurrences and identifies regions of # co-occurrence according to the criteria specified by the # user. # # Can also assign locations if provided with annotation tables in appropriate format. # # # Copyright (c) 2001 by Yonatan Grad and the President and Fellows of Harvard University # # Created, updated, and re-written 9/2000-12/2001 # AnalyzeParameters(@ARGV); process_scn(); # sort the records foreach $sort (sort {$a->{LOCATION} <=> $b->{LOCATION}} values %bylocation) { push (@sorted, $sort); } # clear the holders for ($i=1; $i<$setmotifs; $i++){ $hit[$i] = 0; } $nohit=$found=0; # loop through the number of motifs for ($num = 0; $num < $#sorted; $num++){ # introduce the motif location into the @frame window push (@frame, $sorted[$num]); # increase count of motif corresponding to newest element for ($i = 1; $i < $setmotifs; $i++){ if ($frame[$#frame]->{MOTIF} =~ /n$i/){ $hit[$i]++; $report[$i]++; $lastelem = $frame[$#frame]; } } # shift the window as appropriate while ($frame[$#frame]->{LOCATION}>($window+($frame[0]->{LOCATION}))){ # drop first element of shifting frame $firstelem = shift(@frame); # lower count of appropriate motif for ($i = 1; $i < $setmotifs; $i++){ if ($firstelem->{MOTIF} =~ /n$i/){ $hit[$i]--; } } } # zero nohit parameter to check for all five motifs $nohit = 0; # check if all desired motifs are present for ($x=1; $x < $setmotifs; $x++){ if(exists ($motifs[$x])){ # make nohit positive if at least one of # the motifs is absent if (!($hit[$x])){ $nohit++; if ($found) { $difference = $frame[$#frame-1]->{LOCATION}-$startpt; printf "Motif density is %d / %d\n", $densitycount, ($frame[$#frame-1]->{LOCATION} - $startpt); $" = "|"; print "$startpt|$difference|$densitycount|"; if ($annot){ give_location($startpt); print O "$startpt|$difference|$densitycount|"; } # decrement the last entry into the report array # so it does not include the next motif for ($l = 1; $l < $setmotifs; $l++){ if ($lastelem->{MOTIF} =~ /n$l/){ $report[$l]--; } print "$report[$l]|"; print O "$report[$l]|" if ($annot); } print "\n"; print O "\n" if ($annot); } $found = 0; @report = (); } } } # if nohit is 0 and found is TRUE then print results for only the current motif # being checked (it will appear as an addition to the other results) if ((!($nohit)) && ($found)){ print $frame[$#frame]->{MOTIF}."\t". $frame[$#frame]->{LOCATION}."\t". $frame[$#frame]->{SEQ}."\t". $frame[$#frame]->{ORIENT}."\n"; $densitycount++; } # if nohit is 0, ie if all motifs present, # then print results and set found variable to # true if ((!($nohit)) && (!($found))){ $found = 1; print "\nputative enhancer starting at $frame[0]->{LOCATION}\n"; print $_->{MOTIF}."\t". $_->{LOCATION}."\t". $_->{SEQ}."\t". $_->{ORIENT}."\n" foreach @frame; $densitycount = $#frame + 1; @report = @hit; $startpt = $frame[0]->{LOCATION}; } } sub give_location { if ($_[0] >= $srt[$#srt]->{END}){ $entryA = $srt[$#srt]; if ($entryA->{STRAND} eq "+"){ printf O "found %d bp downstream of $entryA->{NAME}|", ($_[0] - $entryA->{END}); print O "$entryA->{NAME}|0|"; } else { printf O "found %d bp upstream of $entryA->{NAME}|", ($_[0] - $entryA->{END}); print O "$entryA->{NAME}|$entryA->{ACCN}|0|0|"; } } elsif (( ($srt[0]->{STRAND} eq "+") && ($_[0] <= $srt[0]->{START})) || (($srt[0]->{STRAND} eq "-") && ($_[0] <= $srt[0]->{END}))){ $entryA = $srt[0]; if ($entryA->{STRAND} eq "+"){ printf O "found %d bp upstream of $entryA->{NAME} exon $entryA->{EXON}|", ($entryA->{START}-$_[0]); print O "$entryA->{NAME}|$entryA->{ACCN}|0|0|"; } else { printf O "found %d bp downstream of $entryA->{NAME} exon $entryA->{EXON}|", ($entryA->{END}-$_[0]); print O "$entryA->{NAME}|$entryA->{ACCN}|0|0|"; } } else { for ($x = 0; $x < $#srt; $x++){ $entryA = $srt[$x]; $entryZ = $srt[$x-1]; last if ($entryA->{START} > $_[0]) || ($entryA->{END} > $_[0]); } if (($entryA->{END} >= $_[0])&&($entryA->{START} <= $_[0])){ print O "found within $entryA->{NAME} exon $entryA->{EXON}|"; print O "$entryA->{NAME}|$entryA->{ACCN}|0|0|"; } if ($entryA->{START} > $_[0]){ printf O "found %d bp", ($_[0] - $entryZ->{END}); if ($entryZ->{STRAND} eq "+"){ print O " downstream "; } else { print O " upstream "; } print O "of $entryZ->{NAME} exon $entryZ->{EXON} "; printf O "and %d bp", ($entryA->{START} - $_[0]); if ($entryA->{STRAND} eq "+"){ print O " upstream "; } else { print O " downstream "; } print O "of $entryA->{NAME} exon $entryA->{EXON}|"; print O "$entryZ->{NAME}|$entryZ->{ACCN}|$entryA->{NAME}|$entryA->{ACCN}|"; } } } sub process_scn { my @tab; @tab = (); while ($line = ){ chomp($line); @tab = split(/\s+/, $line); if (scalar(@tab) < 6){ print "\nError:\nInput file is not in the proper format\n"; input_format(); exit; } elsif ($tab[1]!~/\d+/){ print "\nError:\nIn input file: 2nd field must be numeric (location of a motif)\n"; input_format(); exit; } elsif ($tab[3]!~/[ACGTNacgtn]/){ print "\nError:\nIn input file: 4th field must contain only A, C, T, G, or N (motif sequence)\n"; input_format(); exit; } elsif ($tab[2]!~/1/&&$tab[2]!~/0/){ print "\nError:\nIn input file: 3rd field must be either a 0 or 1 (strand of motif; 1 is plus strand, 0 is minus)\n"; input_format(); exit; } elsif ($tab[5]!~/n\d/){ print "\nError:\nIn input file: 6th field must be a motif designation in the form n# (e.g. n1)\n"; input_format(); exit; } $record = { LOCATION => $tab[1], SEQ => $tab[2], ORIENT => $tab[3], MOTIF => $tab[5] }; $bylocation { $record->{LOCATION} } = $record; } } sub process_annot { my @tab; while ($line = ){ chomp $line; @tab = split(/\s+/, $line); if ($line=~/\>/){ next; } elsif (scalar(@tab) < 9) { print "\nError:\nAnnotation file is not in the proper format\n"; Format(); exit; } if ($tab[2] eq "" || $tab[2]!~/^\d+$/){ print "\nError:\nIn annotation file: 3rd field must be numeric (coding start site)\n"; Format(); exit; } if ($tab[3] eq "" || $tab[3]!~/^\d+$/){ print "\nError:\nIn annotation file: 4th field must be numeric (coding end site)\n"; Format(); exit; } if ($tab[4] eq "" || $tab[4]!~/[\+\-]/){ print "\nError:\nIn annotation file: 5th field must be either + or - (strand of coding seq)\n"; Format(); exit; } if ($tab[5] eq ""){ print "\nError:\nIn annotation file: 6th field must specify the gene name\n"; Format(); exit; } if ($tab[8] eq "" || $tab[8]!~/^\d+$/){ print "\nError:\nIn annotation file: 9th field must be numeric (coding exon number)\n"; Format(); exit; } if ($tab[6] eq ""){ print "\nError:\nIn annotation file: 7th field must specify the accession name\n"; Format(); exit; } $recordA = { START => $tab[2], END => $tab[3], STRAND => $tab[4], NAME => $tab[5], ACCN => $tab[6], EXON => $tab[8] }; $bystart {$recordA->{START}} = $recordA; } foreach $sortstart (sort {$a->{START} <=> $b->{START}} values %bystart){ push(@srt, $sortstart); } close ANNOT; open (O, ">$infile.$window.list") or die "Can't open annotation-based output file: $!\n"; } sub AnalyzeParameters { my $i; $infile = ""; $window = ""; $annot = ""; $check = ""; $setmotifs = ""; if (@_ == 0) { Usage(); exit; } for ($i=0;$i<@_;++$i) { if ($_[$i] eq "-i") { $infile = $_[$i+1]; ++$i; } elsif ($_[$i] eq "-w") { $window = $_[$i+1]; $i++; } elsif ($_[$i] eq "-m") { $setmotifs = $_[$i+1]; $i++; } elsif ($_[$i] eq "-c") { $check = $_[$i+1]; $i++; } elsif ($_[$i] eq "-a") { $annot = $_[$i+1]; $i++; } } if ($infile eq "") { die "Missing required parameter -i (input file).\n"; } if (! -e $infile) { die "Input file $infile (-i) does not exist.\n"; } open (FILE, "$infile") or die "Can't open $infile: $!\n"; if ($window eq "") { die "Must specify search window (-w).\n"; } if ($setmotifs eq "") { die "Must specify number of motifs (-m).\n"; } if ($check eq ""){ die "Must specify which motifs to check (-c).\n"; } if ($annot eq ""){ print "No annotations specified... Continuing with analysis.\n"; } else { print "Uploading from annotation file $annot.\n"; open (ANNOT, "$annot") or die "Can't open $annot: $!\n"; process_annot(); print "Done uploading annotations.\n"; } if ($window =~/^\d+$/){} else { die "Search window $window (-w) must be numeric.\n"; } if ($setmotifs =~/^\d+$/){ $setmotifs++; } else { die "Number of motifs $setmotifs (-m) must be numeric.\n"; } if ($check=~/^[\d\,]+$/){ @ch = split (/\,/, $check); $chnum = scalar (@ch); if (($chnum+1)>$setmotifs){ die "Number of motifs to check (-c) MUST be less than or equal to total number of motifs (-m).\n"; } else { print "$chnum motifs designated: "; print "$_ " foreach (@ch); for ($j=0;$j<$setmotifs;$j++){ foreach (@ch){ if ($_==$j){ $motifs[$_] = 1; } } } print "\n"; } } else { die "Motifs to check must be designated as follows: ,, etc.\n"; } } sub Usage { print <<_HELP cooccur_scan.pl Usage: perl cooccur_scan.pl -i -m -w -c -a -i: input file (required) -m: number of motifs to be searched for (required) -w: search window in which to look for co-occurrence of all motifs (required) -c: motifs to be checked, using the format: ,, -a: annotation file (optional) _HELP ; } sub Format { print <<_FORM Annotation files must contain one line per annotation, and each line must have at least 9 WHITESPACE delimited fields, with the following fields set as indicated: Field 3: start bp of coding sequence (must be numeric and less than entry in Field 4) Field 4: end bp of coding sequence (must be numeric and greater than entry in Field 3) Field 5: strand (either a + or -) Field 6: gene name Field 7: accession number (e.g., Flybase ID) Field 9: exon number (must be numeric) NOTE: no header lines are allowed in the annotation files. _FORM ; } sub input_format { print <<_INFORMAT Input files contain the results of a DNA sequence scan for specific motifs. These files will be used to search for co-occurrences of motifs according to the parameters specified on the command line. The input file must be a concatenation of all the motif occurrences for all the motifs to be used in the search, with one line per single motif occurrence. The following fields must be set as indicated: Field 2: location of motif Field 3: motif sequence (only A's, C's, G's, T's, and N's allowed) Field 4: strand of motif (0 indicates minus strand, 1 indicates plus strand) Field 6: motif designation (must be in the format n#, e.g. n1) Example: test_sequence_1 2914 1 ACTGACTGAC 15.9638 n1 test_sequence_1 5500 1 ACTGACGTAC 15.9638 n1 test_sequence_1 8364 0 ACTGACTGAC 15.9638 n1 test_sequence_1 465 1 GGAGCAG 8.0235 n2 test_sequence_1 7465 1 GCAGCAG 8.0235 n2 test_sequence_1 931 0 TACCAGGA 9.4333 n3 test_sequence_1 6931 0 TACCTGGA 9.4333 n3 test_sequence_1 10931 0 AACCAGGG 8.7445 n3 _INFORMAT ; }