Title: README for fugu prep
Version: 0.3 (23 Mar 2010)
Author: Brant C. Faircloth
Create: 22 Mar 2010
Edit: 23 Mar 2010
wget http://hgdownload.cse.ucsc.edu/goldenPath/tetNig2/database/all_mrna.sql
wget http://hgdownload.cse.ucsc.edu/goldenPath/tetNig2/database/all_mrna.txt.gz
http://hgdownload.cse.ucsc.edu/goldenPath/tetNig2/database/all_mrna.sql
mv all_mrna.txt all_mrna.22-Mar-2010.txt
mysql> load data infile '/Users/bcf/git/fugu/data/tetNig2/all_mrna.22-Mar-2010.txt' \
into table x_mrna_all fields terminated by '\t';
Use show create table est
to show the table structure for the est data modify this create table statement to add an autoincrementing primary key we need this key later to maintain referential integrity
CREATE TABLE `mrna` (
`id` int unsigned NOT NULL auto_increment,
`bin` smallint(5) unsigned NOT NULL default '0',
`matches` int(10) unsigned NOT NULL default '0',
`misMatches` int(10) unsigned NOT NULL default '0',
`repMatches` int(10) unsigned NOT NULL default '0',
`nCount` int(10) unsigned NOT NULL default '0',
`qNumInsert` int(10) unsigned NOT NULL default '0',
`qBaseInsert` int(10) unsigned NOT NULL default '0',
`tNumInsert` int(10) unsigned NOT NULL default '0',
`tBaseInsert` int(10) unsigned NOT NULL default '0',
`strand` char(2) NOT NULL default '',
`qName` varchar(255) NOT NULL default '',
`qSize` int(10) unsigned NOT NULL default '0',
`qStart` int(10) unsigned NOT NULL default '0',
`qEnd` int(10) unsigned NOT NULL default '0',
`tName` varchar(255) NOT NULL default '',
`tSize` int(10) unsigned NOT NULL default '0',
`tStart` int(10) unsigned NOT NULL default '0',
`tEnd` int(10) unsigned NOT NULL default '0',
`blockCount` int(10) unsigned NOT NULL default '0',
`blockSizes` longblob NOT NULL,
`qStarts` longblob NOT NULL,
`tStarts` longblob NOT NULL,
PRIMARY KEY (id),
KEY `tName` (`tName`(8),`bin`),
KEY `tName_2` (`tName`(8),`tStart`),
KEY `qName` (`qName`(12)),
KEY `tName_3` (`tName`(8),`tEnd`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
insert into mrna (bin, matches, misMatches, repMatches, nCount,
qNumInsert, qBaseInsert, tNumInsert, tBaseInsert, strand, qName, qSize,
qStart, qEnd, tName, tSize, tStart, tEnd, blockCount, blockSizes, qStarts,
tStarts) select * from x_mrna_all;
drop table x_mrna_all;
python splitStart.py --configuration=data/tetNig2/db.conf \
--tablename=starts --reftablename=mrna
using the data in the newly created starts
table (from splitStart.py above), go through regions, compact that all done into a single region
(composed of elements in region_components
and record the area covered by the region
python splitCluster.py --configuration=data/tetNig2/db.conf --distance=0 \
--minoverlaps=0 --tablename=starts --reftablename=mrna
rename table region to exons;
rename table region_components to exon_components;
alter table exons add foreign key (mrna_id) references mrna (id);
create table genes (id int unsigned not null auto_increment, mrna_id int
unsigned not null, chromo varchar(15) default NULL, start int unsigned not
null, end int unsigned not null, primary key (id), index(mrna_id), foreign
key (mrna_id) references mrna(id)) ENGINE=InnoDB charset=utf8;
insert into genes (mrna_id) select distinct(mrna_id) from exons;
alter table genes add foreign key (mrna_id) references mrna (id);
python makeBedFromRegion.py --configuration=data/tetNig2/db.conf \
--out=data/tetNig2/tetNig2_putativeGenes.bed --advanced
(do this via UCSC; overlap >= 80%)
python geneMinMax.py --configuration=data/tetNig2/db.conf
python transalignParser.py --configuration=data/tetNig2/db.conf \
--input=data/tetNig2/tetNig2PutativeGeneIntersecetTransmapReverse.bed
python ncbiQuery.py --configuration=data/tetNig2/db.conf
select category, description, count(refseq.id) from go, refseq, gorefs
where category = 'Function' and refseq.source = 'Homo sapiens' and
gorefs.refseq_id = refseq.id and go.id = gorefs.go_id group by
go.description order by count(refseq.id) desc into outfile
'~/Database/tetNig2/tetNigHomoSapGo.Function.Terms.txt' fields terminated by '\t' ;
select category, description, count(refseq.id) from go, refseq, gorefs
where category = 'Process' and refseq.source = 'Homo sapiens' and
gorefs.refseq_id = refseq.id and go.id = gorefs.go_id group by
go.description order by count(refseq.id) desc into outfile
'~/Database/tetNig2/tetNigHomoSapGo.Process.Terms.txt' fields terminated by '\t' ;
select category, description, count(refseq.id) from go, refseq, gorefs
where category = 'Function' and refseq.source = 'Mus musculus' and
gorefs.refseq_id = refseq.id and go.id = gorefs.go_id group by
go.description order by count(refseq.id) desc into outfile
'~/Database/tetNig2/tetNigMusMus.Function.Terms.txt' fields terminated by '\t' ;
select category, description, count(refseq.id) from go, refseq, gorefs
where category = 'Process' and refseq.source = 'Mus musculus' and
gorefs.refseq_id = refseq.id and go.id = gorefs.go_id group by
go.description order by count(refseq.id) desc into outfile
'~/Database/tetNig2/tetNigMusMus.Process.Terms.txt' fields terminated by '\t' ;
select category, description, count(refseq.id) from go, refseq, gorefs
where category = 'Function' and refseq.source = 'Danio rerio' and
gorefs.refseq_id = refseq.id and go.id = gorefs.go_id group by
go.description order by count(refseq.id) desc into outfile
'~/Database/tetNig2/tetNigDanRerGo.Function.Terms.txt' fields terminated by '\t' ;
select category, description, count(refseq.id) from go, refseq, gorefs
where category = 'Process' and refseq.source = 'Danio rerio' and
gorefs.refseq_id = refseq.id and go.id = gorefs.go_id group by
go.description order by count(refseq.id) desc into outfile
'~/Database/tetNig2/tetNigDanRerGo.Process.Terms.txt' fields terminated by '\t' ;
(No hits)
(No hits)
wget http://hgdownload.cse.ucsc.edu/goldenPath/tetNig2/bigZips/chromFa.tar.gz
tar -xzvf chromFa.tar.gz
faToTwobit sequence/*.fa tetNig2.2bit
python exonsToFakeReads.py --twobit=data/tetNig2/tetNig2.2bit \
--configuration=data/tetNig2/db.conf --exons=exons --genes=genes \
--output=data/tetNig2/myExons.fa
wget http://hgdownload.cse.ucsc.edu/goldenPath/fr2/bigZips/chromFa.tar.gz
faTwoTwobit sequence/* fr2.2bit
lastz our exons in Tetraodon to all sequence in Fugu (33 min execution) [first ran this as coverage=80 identity=90]. Placed everything in 80_80 or 80_90 directories.
python ../../../../seqcap/Alignment/easyLastz.py \
--target=../../fr2/fr2.2bit \
--query=myExons.2bit --coverage=80 --identity=90 \
--output=myExonsMatchToFugu.80_80.lastz
mv myExons.2bit lastz/
mv myExons.fa lastz/
mv *.lastz lastz/
cp *.lastz myExonsMatchToFugu.lastz.initial
sed 's/%//g' myExonsMatchToFugu.lastz > myExonsMatchToFugu.lastz.clean
fuguMatch | CREATE TABLE `fugumatch` (
`exons_id` int(10) unsigned NOT NULL,
`mrna_id` int(10) unsigned NOT NULL,
`score` int(10) unsigned NOT NULL,
`name1` varchar(20) NOT NULL,
`strand1` varchar(1) NOT NULL,
`zstart1` int(10) unsigned NOT NULL,
`end1` int(10) unsigned NOT NULL,
`length1` smallint(5) unsigned NOT NULL,
`name2` varchar(100) NOT NULL,
`strand2` varchar(1) NOT NULL,
`zstart2` smallint(5) unsigned NOT NULL,
`end2` smallint(5) unsigned NOT NULL,
`length2` smallint(5) unsigned NOT NULL,
`diff` text NOT NULL,
`cigar` text NOT NULL,
`identity` varchar(12) DEFAULT NULL,
`percent_identity` float DEFAULT NULL,
`continuity` varchar(12) DEFAULT NULL,
`percent_continuity` float DEFAULT NULL,
`coverage` varchar(12) DEFAULT NULL,
`percent_coverage` float DEFAULT NULL,
KEY `exons_id` (`exons_id`),
KEY `mrna_id` (`mrna_id`),
KEY `percent_identity` (`percent_identity`),
KEY `percent_coverage` (`percent_coverage`),
CONSTRAINT `fugumatch_ibfk_1` FOREIGN KEY (`exons_id`) REFERENCES `exons` (`id`),
CONSTRAINT `fugumatch_ibfk_2` FOREIGN KEY (`mrna_id`) REFERENCES `mrna` (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
python ../../../../lastzInserter.py --configuration=../../db.conf \
--input=myExonsMatchToFugu.80_80.lastz.clean
create temporary table dup as select exons_id as id, count(*) as n from
fugumatch group by id having n > 1;
alter table fugumatch add column duplicate bool not null;
update fugumatch, dup set duplicate = 1 where fugumatch.exons_id = dup.id;
CREATE TABLE `fugugenes` (
`genes_id` int(10) unsigned NOT NULL,
`mrna_id` int(10) unsigned NOT NULL,
`chromo` varchar(15) DEFAULT NULL,
`start` int(10) unsigned NOT NULL,
`end` int(10) unsigned NOT NULL,
`span_diff` int(10) unsigned NOT NULL,
`avg_coverage` float unsigned NOT NULL,
`avg_identity` float unsigned NOT NULL,
Index (genes_id),
Index (mrna_id),
Index (span_diff),
Index (avg_coverage),
Index (avg_identity),
FOREIGN KEY (`genes_id`) REFERENCES `mrna` (`id`),
FOREIGN KEY (`mrna_id`) REFERENCES `mrna` (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
determine which entire genes (denoted by the match of exons in tetraodon with those in fugu) are present in fugu. This avoids duplicate regions identified by lastz, and it returns the difference in the "span" of a gene, as well as metrics of it's coverage and conservation (measured as % identity).
python fuguGeneRegionChecker.py --configuration=data/tetNig2/db.conf