View Single Post
Old 04-15-2014, 12:35 AM   #2
dpryan
Devon Ryan
 
Location: Freiburg, Germany

Join Date: Jul 2011
Posts: 3,480
Default

Here's an example script I wrote in R that takes an annotation GFF file prepared by DEXSeq and adds "intronic_part" records with associated gene_ids. You'll find both the original exonic bins and the newer intronic bins in the resulting file (you'd need to change the file names), which you may or may not want. You could also modify this script to make the intronic bins "exonic bins" and just look at exons and introns at once (I don't know how well that'd work) or remove the old exonic bins and simply label the intronic_parts exonic_parts, which would probably make your life easier in DEXSeq (I just modified DEXSeq). For the latter, just change the "asGFF2()" function.

Code:
#!/usr/bin/Rscript
library(GenomicRanges)
library(rtracklayer)
library(parallel)

gff <- import.gff2("Mus_musculus.GRCm38.71.DEXSeq.gff", asRangedData=F)
#Add a level for "intronic_part"
elementMetadata(gff)$type <- factor(elementMetadata(gff)$type, levels=c(levels(elementMetadata(gff)$type), "intronic_part"))
#Fix the exonic_part_number to be a properly formatted character
USE <- which(!is.na(elementMetadata(gff)$exonic_part_number))
exonic_parts <- sprintf("%03i", elementMetadata(gff)$exonic_part_number[USE])
elementMetadata(gff)$exonic_part_number <- as.character(elementMetadata(gff)$exonic_part_number)
elementMetadata(gff)$exonic_part_number[USE] <- exonic_parts

#Split by gene_id
grl <- split(gff, elementMetadata(gff)$gene_id)
#Add the introns as "intronic_parts
add_introns <- function(gr) {
    exons <- gr[which(elementMetadata(gr)$type=="exonic_part"),]
    if(length(exons) > 1) {
        seqname <- seqnames(exons)[-1]
        starts <- end(exons)+1
        starts <- starts[-length(starts)]
        ends <- start(exons)-1
        ends <- ends[-1]
        bounds <- IRanges(start=starts, end=ends)
        strand <- strand(exons)[-1]
        introns <- GRanges(seqnames=seqname, ranges=bounds, strand=strand)
        intron_ids <- sprintf("%03i", c(1:length(introns)))
        #Remove 0-width introns
        DISCARD <- which(width(introns) <= 0)
        if(length(DISCARD) > 0) {
            introns <- introns[-DISCARD]
            intron_ids <- intron_ids[-DISCARD] #Set intron numbers so they follow their respective exonic parts
        }
        if(length(introns) > 0) {
            #create the meta-data
            df <- as.data.frame(elementMetadata(exons))
            nrows <- length(introns)
            metadf <- df[1:nrows,] #does this need to deal with gene_id and transcripts differently?
            metadf <- transform(metadf, gene_id=as.character(gene_id), transcripts=as.character(transcripts))
            metadf$transcripts <- as.character(c(rep(NA, nrows)))
            metadf$type <- factor(c(rep("intronic_part", nrows)), levels=levels(metadf$type))
            metadf$exonic_part_number <- intron_ids
            elementMetadata(introns) <- metadf
            #Merge the GRanges
            gr <- append(gr, introns)
            gr <- gr[order(start(gr), elementMetadata(gr)$type),] #resort
        }
    }
    return(gr)
}
with_introns <- endoapply(grl, add_introns) 
#reorder things
chroms <- sapply(with_introns, function(x) as.factor(seqnames(x))[1])
starts <- sapply(with_introns, function(x) start(x)[1])
o <- order(chroms, starts)
with_introns2 <- with_introns[o]
##Merge into a GRange
#with_introns2 <- unlist(with_introns2, use.names=F, recursive=T)
#Create GFF formatted output
asGFF2 <- function(x) {
    df <- as.data.frame(x)
    aggregates <- which(df$type == "aggregate_gene")
    meta <- character(nrow(df))
    meta[aggregates] <- sprintf("gene_id \"%s\"", df$gene_id[aggregates])
    #This gives introns a transcript "NA" field, which may not be ideal
    meta[-aggregates] <- sprintf("transcripts \"%s\"; exonic_part_number \"%s\"; gene_id \"%s\"", df$transcripts[-aggregates], df$exonic_part_number[-aggregates], df$gene_id[-aggregates])
    paste(df$seqnames, "dexseq_prepare_annotation.py", df$type, df$start, df$end, ".", df$strand, ".", meta, sep="\t")
}
outputGFF <- unlist(lapply(with_introns2, asGFF2))
write.table(outputGFF, file="Mus_musculus.GRCm38.71.DEXSeq.introns.gff", row.names=F, col.names=F, quote=F)
dpryan is offline   Reply With Quote