#!/usr/bin/perl
use warnings;
use strict;

### This script analyses directional (!!!) RRBS FastQ files (before mapping) and tries to identify reads which contain a second MspI site followed by an A (from A-tailing) and then
### continues to read into the adapter sequence. The number of bases identical to the adapter sequence can be specified below. The script will also output the average Phred score for
### the (non-)conversion position to see if something is fundamentlly wrong. The Phred-score offset needs to be specified below.

unless (@ARGV){
  die "Please provide one or more FastQ files (uncompressed or gzipped) to analyse the bisulfite non-conversion rate in directional RRBS samples\nUSAGE: ./find_RRBS_non_conversion.pl [filename(s)]\n"
}

my $required_adapter_overlap = 5;

warn "\nThe specified RRBS read file(s) will be analysed for their bisulfite non-conversion rate. The required overlap with the Illumina adapter is set to: $required_adapter_overlap bp.\n(If you want to change this stringency please edit the \$required_adapter_overlap variable at the start of this script)\n\n";

my $Phred_encoding = 33;   # change this to 33 for Sanger encoding, or 64 for Illumina 1.5 encoding


###################################################################################################

foreach my $filename (@ARGV){

  if ($filename =~ /\.gz$/){
    open (IN,"zcat $filename |") or die "Couldn't read from file:$!\n";
  }
  else{
    open (IN,$filename) or die "Couldn't read from file:$!\n";
  }

  my $count = 0;
  my $CGG_count = 0;
  my $TGG_count = 0;
  my $converted = 0;
  my $resistant = 0;

  ### note on 28 oct 2011: The adapters used by Samantha Cooper (Illumina) are slightly different to what we use, they start forking 1bp earlier! (possibly TruSeq adapters?):
  # adaptor1                                       GATCGGAAGAGCACACGTCTGAAC  # Standard PE adaptors, read1
  # adaptor2                                       GATCGGAAGAGCGTCGTGTAGGGA  # Standard PE adaptors, read2

  # what we used:
  # the adapter on the 3' end of read 1 reads: (A) GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
  # the adapter on the 3' end of read 2 reads: (A) GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT

  ## since they have the first 13bp in common but then start to diverge (forking-part of the adapters) we are only going to use these first 13bp
  my $adapter = 'GATCGGAAGAGCG';

  my @adapter = split //,$adapter;

  my $T_scores = 0;
  my $C_scores = 0;

  while (1){
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier2 = <IN>;
    my $quality_score = <IN>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);
    ++$count;
    ## small check if the sequence file appears to be a FastQ file
    if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
      die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
    }
    if ($count%1000000 == 0){
      warn "processed $count sequences\n";
    }

    if ($sequence =~ /^CGG/){
      $CGG_count++;
    }
    elsif ($sequence =~ /^TGG/){
      $TGG_count++;
    }
    else{
      next;
    }

    chomp $sequence;

    ### CASE 1: filled-in residue was converted
    if ($sequence =~ /TTGA(.*)/){   # the A comes from the A-tailing process, the second 'T' would be a converted filled-in cytosine
      my $rest = $1;
      my $poi = 40 - length($rest)-3;

      unless (length($rest) == 0){
	my @rest = split //,$rest;

	my $n_count = 0;
	my $match_count = 0;
	my $mismatch_count = 0;

	for my $index (0..$#rest) {
	
	  last if ($index > $#adapter);
	
	  if ($rest[$index] eq 'N') {
	    ++$n_count;
	    next;
	  }

	  if ($rest[$index] eq $adapter[$index]) {
	    ++$match_count;
	  }
	  else {
	    ++$mismatch_count;
	  }
	}
	
	# requiring a near perfect match to the adapter sequence (just change as desired)
	if ($mismatch_count == 0 and $n_count == 0 and $match_count >= $required_adapter_overlap) {

	  my $qual = substr ($quality_score,$poi,1);
	  my $base = substr ($sequence,$poi,1);
	  my $phred_score = convert_quality_string_into_phred_score($qual);
	  # print "$poi\t$base\t$qual\t$phred_score\n";
	  $converted++;
	  $T_scores += $phred_score;
	}
      }
    }

    ### CASE 2: filled-in residue was not converted
    if ($sequence =~ /TCGA(.*)/){    # the A comes from the A-tailing process, the 'C' would be a non-converted filled-in cytosine
      my $rest = $1;
      my $poi = 40 - length($rest)-3;

      unless (length($rest) == 0){
	my @rest = split //,$rest;

	my $n_count = 0;
	my $match_count = 0;
	my $mismatch_count = 0;

	for my $index (0..$#rest) {
	
	  last if ($index > $#adapter);
	
	  if ($rest[$index] eq 'N') {
	    ++$n_count;
	    next;
	  }

	  if ($rest[$index] eq $adapter[$index]) {
	    ++$match_count;
	  }
	  else {
	    ++$mismatch_count;
	  }
	}
	
	# requiring a near perfect match to the adapter sequence
	if ($mismatch_count == 0 and $n_count == 0 and $match_count >= $required_adapter_overlap) {
	  my $qual = substr ($quality_score,$poi,1);
	  my $base = substr ($sequence,$poi,1);
	  my $phred_score = convert_quality_string_into_phred_score($qual);
	  $resistant++;
	  $C_scores += $phred_score;
	}
      }
    }
  }

  print "\n$count sequences analysed in total\n\n";
  my $percent_CGG = sprintf ("%.1f",$CGG_count/$count*100);
  my $percent_TGG = sprintf ("%.1f",$TGG_count/$count*100);
  print "CGG at the start: $CGG_count\t($percent_CGG% of total sequences)\n";
  print "TGG at the start: $TGG_count\t($percent_TGG% of total sequences)\n\n";

  print "Filled-in positions\nBisulfite converted: $converted\n";
  print "Conversion resistant: $resistant\n";
  my $percentage = sprintf ("%.2f",$resistant/($converted+$resistant)*100);
  print "Percentage non bisulfite converted: $percentage\n\n";

  print "Average Phred score for converted positions:\t";
  printf ("%.1f",$T_scores/$converted);
  print "\n";
  print "Average Phred score for non-converted positions:\t";
  printf ("%.1f",$C_scores/$resistant);
  print "\n\n";
}

sub convert_quality_string_into_phred_score{
  my $string = shift;
  my $qual = ord($string)-$Phred_encoding;
  return $qual;
}
