to download from ensembl ftp site
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 9 Nov 2011 20:07:36 +0000 (20:07 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 9 Nov 2011 20:07:36 +0000 (20:07 +0000)
forester/ruby/scripts/ensembl_ftp.rb [new file with mode: 0644]

diff --git a/forester/ruby/scripts/ensembl_ftp.rb b/forester/ruby/scripts/ensembl_ftp.rb
new file mode 100644 (file)
index 0000000..a38753f
--- /dev/null
@@ -0,0 +1,32 @@
+require 'net/ftp'
+
+EMAIL           = 'czmasek@burnham.org'
+PUB_RELEASE_DIR = '/pub/release-64/fasta'
+PEP_DIR         = '/pep'
+
+ftp = Net::FTP.new('ftp.ensembl.org', 'anonymous', EMAIL)
+ftp.passive = true # To avoid "No route to host" error.
+ftp.chdir( PUB_RELEASE_DIR )
+files = ftp.list('*_*') # To only look at files with an underscore.
+count = 0
+files.each do | file |
+  species = file.split().last
+  begin
+    ftp.chdir(species + PEP_DIR)
+    pepfiles = ftp.list()
+    pepfiles.each do | pepfile |
+      pepfile = pepfile.split().last
+      if pepfile =~ /all.fa.gz/ # Only want the "all.fa.gz" files (and not the
+                                # "abinitio" files).
+        ftp.getbinaryfile(pepfile)
+        puts 'downloaded "' + pepfile + '"'
+        count += 1
+      end
+    end
+  rescue Exception
+    puts 'ignoring "' + species + '"'
+  end
+  ftp.chdir(PUB_RELEASE_DIR) # To go back to the starting directory.
+end
+ftp.close
+puts 'done (downloaded ' + count.to_s + ' files)'
\ No newline at end of file