#!/usr/bin/env ruby
# vbasyn: mirror tool for the vba repository (see usage below).
# Frozen so the identification strings cannot be mutated at runtime.
MY_NAME = "vbasyn".freeze
MY_VERSION = "0.0.9".freeze
# Print the detailed help text (synopsis, options, parameters, webserver
# deployment notes) and abort with exit status 1.
# Fixes typos in the user-visible text: "shoud" -> "should",
# "EXPOESD" -> "EXPOSED", "Maintainter" -> "Maintainer".
def usage
  print "
== Synopsis (#{MY_NAME}-#{MY_VERSION})
vbasyn.rb: Script for partial or complete vba repository mirroring
(http://anti-virus.by/update). Known to work on: Linux, FreeBSD,
MS Windows with installed ruby-1.8.0 or newer (http://www.ruby-lang.org/)
== Usage
vbasyn.rb [-x proxy-uri] [-V] [-u user-id] [-v] [-1] [-p] [-P] [-s sort_type] repo-uri target-dir project-list
vbasyn.rb [-x proxy-uri] [-V] [-u user-id] [-v] [-1] [-p] [-P] [-s sort_type] -a repo-uri target-dir
vbasyn.rb [-x proxy-uri] [-V] [-u user-id] [-v] [-1] -l repo-uri project-list
vbasyn.rb [-x proxy-uri] [-V] [-u user-id] [-v] [-1] -a -l repo-uri
== Examples
./vbasyn.rb --verbose http://anti-virus.by/update /tmp/vba32 VBA32_W VBA32_L
./vbasyn.rb --verbose --all http://anti-virus.by/update /tmp/vba32
./vbasyn.rb --proxy http://user:password@proxy.host:3128 --verbose --all http://anti-virus.by/update /tmp/vba32
== Options
-h, --help - show help
-v, --verbose - be verbose :)
-l, --print-list - print full list of links to download (do not perform actual download)
-1, --compat-v1 - create uncompressed binaries (.bin extension) and
ini files (.ini extension) (v1 repository format).
-V, --version - show version
-u, --announce-as uid - Insert local user info into version and User-agent. This cookie helps
us to distinguish mirrorers. Useful for us to estimate server load.
-a, --all - mirror all product sets
-d, --debug - show debug info. useful for developers and advanced users
-p, --prefetch - preserves old repository metadata without cleaning any files.
Sometimes it's useful to prefetch all large files and
not change state of your repository.
-P, --progress - show download progress for current file
-s, --sort - sort order for downloaded files. Valid values are:
size
name (default)
ext
-x, --proxy proxy-uri - use proxy-uri as a content provider. Username and password
are supported in a regular way. Some examples:
--proxy http://proxy:3128
--proxy http://user:password@proxy:3128
WARNING! this option requires ruby-1.9 and upper
== Parameters
repo-uri - full uri to repository (can be ftp, http)
example: http://anti-virus.by/update
target-dir - absolute or relative path to dir where to store data.
Dir should be created manually before script started.
examples: /srv/vba32
./vba32/
c:\\www\\vba32
project-list - one or more projects desired to synchronize.
It's strongly recommended to use proper symbol
case in project names:
examples : VBA32_L (not vba32_l, not Vba32_l)
VBA32_KW vbasmf-linux-glibc2.3 (not VBA32_KW VBASMF-LINUX-GLIBC2.3)
== Notes
If you use HTTP webservers to expose your update mirror - be sure you denied caching
of *.ini and *.ini.gz files in your webserver configuration.
We are not HTTP servers experts, so we recommend you to check up-to-date manual for
your webserver.
Anyway, described below variants are known to work in practice.
Let's assume you have following directory layout:
ON DISK DIR => EXPOSED BY HTTP
wwwroot/update/ => /update/
wwwroot/beta/update/ => /beta/update/
Apache users:
create file (append to existing) wwwroot/.htaccess with following content:
Header Set Cache-Control \"max-age=0, no-store\"
Header Set Cache-Control \"max-age=0, no-store\"
Lighttpd users:
add following lines to your lighttpd.conf:
# any .ini
$HTTP[\"url\"] =~ \"/update/.*\\.ini$\" {
setenv.add-response-header = (
\"Cache-Control\" => \"private, pre-check=0, post-check=0, max-age=0\",
\"Expires\" => \"0\",
\"Pragma\" => \"no-cache\"
)
}
# any .ini.gz
$HTTP[\"url\"] =~ \"/update/.*\\.ini\\.gz$\" {
setenv.add-response-header = (
\"Cache-Control\" => \"private, pre-check=0, post-check=0, max-age=0\",
\"Expires\" => \"0\",
\"Pragma\" => \"no-cache\"
)
}
== THEORY OF OPERATION ==
Why do we need this stuff at all?
Our databases update very frequently (usually twice a day),
so you can simply run into problem mirroring inconsistent repository.
This script aims to keep local repository consistent on each step.
It's very simple. What do we do:
* fetch_needed_files - fetch new binary files
* store_inis - update .ini/.ini.gz files allowing clients to actually use new update
* cleanup_repository - remove old cruft
Otherwise we will likely get inconsistent state of information in .ini/.ini.gz files and
actual binaries on update resource.
== REPORTING BUGS ==
Maintainer: Sergei Trofimovich
2008, 2009, 2010 VirusBlokAda Ltd.
Public domain.
== The End ==
"
  exit 1
end
# Print a one-line hint pointing at --help, then abort with exit status 1.
def short_usage
  print "\nTry -h or --help option to get detailed info\n"
  exit 1
end
require 'digest/md5'
require 'find'
require 'getoptlong'
require 'open-uri'
require 'stringio'
require 'zlib'
#
# parse options
#
# GetoptLong spec: [long name, short alias, argument requirement].
opts = GetoptLong.new(
[ '--help', '-h', GetoptLong::NO_ARGUMENT ],
[ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
[ '--print-list', '-l', GetoptLong::NO_ARGUMENT ],
[ '--compat-v1', '-1', GetoptLong::NO_ARGUMENT ],
[ '--version', '-V', GetoptLong::NO_ARGUMENT ],
[ '--announce-as', '-u', GetoptLong::REQUIRED_ARGUMENT ],
[ '--all', '-a', GetoptLong::NO_ARGUMENT ],
[ '--debug', '-d', GetoptLong::NO_ARGUMENT ],
[ '--prefetch', '-p', GetoptLong::NO_ARGUMENT ],
[ '--progress', '-P', GetoptLong::NO_ARGUMENT ],
[ '--sort', '-s', GetoptLong::REQUIRED_ARGUMENT ],
[ '--proxy', '-x', GetoptLong::REQUIRED_ARGUMENT ]
)
# Global option state, filled in by the option-parsing loop below.
# nil means "flag not given".
$verbose = nil # --verbose: info messages on stderr (vprint)
$debug = nil # --debug: developer trace messages on stderr (dprint)
$print_list = nil # --print-list: only print download links, skip fetching
$compat_v1 = nil # --compat-v1: also keep uncompressed .bin/.ini (v1 format)
$show_version = nil # --version: print id string and exit
$announce_as = nil # --announce-as: user cookie embedded into User-Agent
$sync_all = nil # --all: mirror every project listed in MD5SUMS.ini.gz
$prefetch = nil # --prefetch: fetch binaries but leave metadata untouched
$show_progress = nil # --progress: per-file download progress on stderr
$sort_by = 'name' # --sort: download order; 'size', 'name' or 'ext'
$proxy = nil # --proxy: parsed URI of the HTTP proxy, if any
# Identification string used for --version output and as the HTTP
# User-Agent; includes the optional --announce-as user cookie.
def my_id
  user_part = $announce_as ? "; user: '#{$announce_as}'" : ""
  "#{MY_NAME}-#{MY_VERSION} (platform: #{RUBY_PLATFORM}; ruby: #{RUBY_VERSION}#{user_part})"
end
###
# not very stable and unintuitive :]. currently it's an undocumented option
###
# "--config FILE" as the only two arguments: read the real argument list
# from FILE (one argument per line) and substitute it into ARGV.
if ARGV.size == 2 && ['--config', '-c'].include?(ARGV[0])
  options_from_file = File.open(ARGV[1], "r") { |fh| fh.to_a.collect { |line| line.chomp } }
  ARGV.clear
  ARGV.concat options_from_file
end
# Walk the parsed options and record them in the globals above.
opts.each do |opt, arg|
  case opt
  when '--help'        then usage
  when '--verbose'     then $verbose = 1
  when '--debug'       then $debug = 1
  when '--print-list'  then $print_list = 1
  when '--compat-v1'   then $compat_v1 = 1
  when '--version'     then $show_version = 1
  when '--announce-as' then $announce_as = arg
  when '--all'         then $sync_all = 1
  when '--prefetch'    then $prefetch = 1
  when '--progress'    then $show_progress = 1
  when '--sort'
    # --sort only accepts one of these literal values
    valid_sort_types = %w[size ext name]
    unless valid_sort_types.include?(arg)
      $stderr.print "ERROR: bad '#{opt}' option: '#{arg}'. One of #{valid_sort_types.inspect} expected.\n"
      exit 1
    end
    $sort_by = arg
  when '--proxy'
    $proxy = URI.parse arg
  end
end
# Verbose-mode logging: newline-terminated write to stderr,
# emitted only when --verbose was given.
def vprint(str)
  return unless $verbose
  $stderr.print(str.to_s + "\n")
end
# Debug logging: newline-terminated write to stderr,
# emitted only when --debug was given.
def dprint(str)
  return unless $debug
  $stderr.print(str.to_s + "\n")
end
# Unconditional error logging: newline-terminated write to stderr.
def eprint(str)
  $stderr.print(str.to_s + "\n")
end
# Run +block+, downgrading any StandardError to a warning on stderr
# (message plus backtrace in verbose mode) instead of propagating it.
def nothrow(&block)
  block.call
rescue => e
  eprint "WARN: #{e}"
  vprint e.backtrace.collect { |frame| "#{frame}\n" }
  vprint "end of WARN: #{e}"
end
# Return the uppercase hex MD5 digest of the file at +filename+.
# Digest::MD5.file streams the file in chunks, replacing the
# hand-rolled 64 KiB read loop of the original implementation.
def get_md5(filename)
  return Digest::MD5.file(filename).hexdigest.upcase
end
# Decompress a gzip byte string in memory and return the unpacked data.
# Uses StringIO as the IO adapter for Zlib::GzipReader, replacing the
# original hack that grafted a singleton #read method onto a String copy.
# The reader is now closed even when decompression raises.
def gunzip_string(gz_string)
  reader = Zlib::GzipReader.new(StringIO.new(gz_string))
  begin
    return reader.read
  ensure
    reader.close
  end
end
########################################
# Match offset (truthy) when +file+ ends in ".gz" (case-insensitive),
# nil otherwise.
def is_gz_file(file)
  /\.gz$/i =~ file
end
# Read the gzip file through to EOF to verify it unpacks cleanly.
# Raises a Zlib error on corrupt input, or RuntimeError if the reader
# stalls (returns no data before EOF). Fixes the error-message typo
# "unabe" -> "unable".
def validate_gz_file(gz_file)
  Zlib::GzipReader.open(gz_file) {|gzip|
    while (!gzip.eof)
      result = gzip.read(65536)
      raise "unable to unpack" if result.size < 1
    end
  }
end
# Return true when +file+ exists, has exactly +file_size+ bytes and MD5
# +md5+; otherwise delete the stale file (best effort) and return nil.
# For .gz files it additionally verifies the archive unpacks, and in
# --compat-v1 mode writes the decompressed copy to +unpacked_file+
# if that does not exist yet.
# Fix: FileTest.exists? was removed in Ruby 3.2; use exist? (available
# since 1.8) instead.
def valid?(file, file_size, md5, unpacked_file)
  begin
    return nil if !FileTest.exist?(file)
    raise "#{file}: size failed: #{FileTest.size(file)} wanted #{file_size}" if File.size(file) != file_size
    raise "md5 failed" if get_md5(file) != md5
  rescue => e
    dprint file + " needs update: " + e.to_s
    # TODO: continue download if less
    # best-effort removal of the broken file; failures are only warnings
    nothrow{ File.delete(file) if FileTest.exist?(file) }
    return nil
  end
  if is_gz_file(file)
    # raises (propagates to caller) when the archive is corrupt
    validate_gz_file(file)
    if unpacked_file and $compat_v1
      if !FileTest.exist?(unpacked_file)
        vprint "UNPACK: #{file} -> #{unpacked_file}"
        Zlib::GzipReader.open(file) {|gzip|
          open(unpacked_file, "wb") {|g|
            while (!gzip.eof)
              g.write gzip.read(65536)
            end
          }
        }
      end
      # TODO:
      # we can try to validate unpacked file here
      # by size/checksum
    end
  end
  return true
end
# Open +uri+ via open-uri with our User-Agent cookie (and proxy
# basic-auth credentials when --proxy was given), yield the opened
# stream to the block and return the block's result.
# NOTE(review): relies on Kernel#open accepting URI strings, which
# open-uri stopped hooking in Ruby 3.0 — presumably targets older
# rubies; confirm before running on 3.x.
def uri_open(uri, perms, opts = {}, &block)
opts ||= {}
# announce ourselves unless the caller already set a User-Agent
opts = opts.merge({"User-Agent" => my_id}) if !opts["User-Agent"]
if $proxy
# needs ruby-1.9 open-uri
opts = opts.merge({:proxy_http_basic_authentication => [$proxy, $proxy.user, $proxy.password]}) if !opts[:proxy_http_basic_authentication]
end
r = nil
vprint "URI-OPEN: #{uri}"
open(uri, perms, opts){|f|
r = yield f
}
return r
end
###################################
# used for fetching files from repo
#
# Download <uri>/<file> into <target>/<file> (64 KiB chunks) and
# return the local path.
def get_repo_file(uri, target, file, opts={})
  dprint "get_repo_file(uri='#{uri}', target='#{target}', file='#{file}')"
  local_file = target + '/' + file
  remote_file = uri + '/' + file
  vprint "FETCH: -> #{local_file}"
  uri_open(remote_file, "rb", opts) do |remote|
    File.open(local_file, "wb") do |local|
      local.write(remote.read(65536)) until remote.eof
    end
  end
  return local_file
end
##############################
# unlike get_repo_file it stores
# files directly to target
#
# Skip the download when the local copy already validates; otherwise
# fetch and re-validate. Raises when the fetched file is still broken.
def fetch_repo_file(uri, target, file, file_size, md5, unpacked_name, opts={})
  local_file = target + '/' + file
  unpacked_file = unpacked_name ? target + '/' + unpacked_name : nil
  return local_file if valid?(local_file, file_size, md5, unpacked_file)
  get_repo_file(uri, target, file, opts)
  return local_file if valid?(local_file, file_size, md5, unpacked_file)
  raise "unable to get #{file} from #{uri}"
end
# Write +new_contents+ to +target_file_path+ only when it differs from
# what is already on disk (keeps mtimes stable for unchanged files).
def store_small_file(target_file_path, new_contents)
  #TODO: make .ini subst atomic via create+rename
  old_contents = begin
                   File.open(target_file_path, "rb") { |fh| fh.read }
                 rescue
                   nil # missing/unreadable file counts as "different"
                 end
  if old_contents == new_contents
    dprint "KEEP: #{target_file_path}"
  else
    vprint "STORE: #{File.basename target_file_path}"
    File.open(target_file_path, "wb") { |fh| fh.write new_contents }
  end
end
# Store every .ini.gz from +inis_map+ (name -> raw gz bytes) under
# +target+; in --compat-v1 mode also store the decompressed .ini twin.
def store_inis(target, inis_map)
  inis_map.each do |name, data|
    ini_gz_path = target + '/' + name
    store_small_file(ini_gz_path, data)
    # old repo format: also keep the unzipped ini next to the .gz
    next unless $compat_v1
    ini_path = ini_gz_path[0..-4] # strip ".gz"
    ungz_data = Zlib::GzipReader.open(ini_gz_path) { |gzip| gzip.read }
    store_small_file(ini_path, ungz_data)
  end
end
############################
# Parse the text of a project .ini file.
#
# result: hash = { 'files' =>
#                    { file_md5 => { 'size' => size,
#                                    'md5' => md5,
#                                    'unpacked' => name },
#                      ... },
#                  'id' => string,
#                  attr => string } # attr ::= "Version" | "Signature" | "Date" .
def parse_ini_string(ini_string)
  parsed = { 'files' => {} }
  ini_string.each_line do |raw_line|
    line = raw_line.chomp
    case line
    # section header: [name]
    when /^\[(.*)\]$/
      parsed['id'] = $1
    # file entry, e.g.:
    # Events.cfg=bf4cbdae.bin,AGENT,12257,BF4CBDAEE63A8481E7EE6E9E7F1C5B97,0
    when /^.*\=(.*)\,.*\,(.*)\,(.*)\,.*$/
      parsed['files'][$3] = { 'size' => $2.to_i,
                              'md5' => $3,
                              'unpacked' => $1 }
    # plain attribute: key=val
    when /^(.*)\=(.*)$/
      parsed[$1] = $2
    end
  end
  parsed
end
############################
# Parse patches.ini contents.
#
# result: hash = { 'files' =>
#          { filename   => { 'size' => size,
#                            'md5' => md5 },
#            patch_name => { 'size' => size,
#                            'md5' => md5,
#                            'patch' => { 'from' => old_file_md5,
#                                         'to' => new_file_md5 } }
#            ...
#          },
#        }
def parse_patches_string(patches_string)
  parsed = { 'files' => {} }
  patches_string.each_line do |raw_line|
    line = raw_line.chomp
    case line
    # "+ new_file_md5 packed_file_size md5_packed_file" — whole file
    when /^\+ ([^\s]+) (\d+) ([^\s]+)/
      parsed['files'][$1] = { 'size' => $2.to_i, 'md5' => $3 }
    # "* old_file_md5 new_file_md5 packed_file_size md5_packed_file" — delta
    when /^\* ([^\s]+) ([^\s]+) (\d+) ([^\s]+)/
      parsed['files']["#{$2}-#{$1}.bdt"] = { 'size' => $3.to_i,
                                             'md5' => $4,
                                             'patch' => { 'from' => $1,
                                                          'to' => $2 } }
    end
  end
  parsed
end
###########################
# result: hash = { 'files' =>
#          { filename => { 'size' => size,
#                          'md5' => md5 },
#            ... },
#          'ini' => { filename => filecontents ,
#                     ... },
#
#          'projects' => [ "list", "of", "projects", "to", "syn" ],
#          'md5sums' => { filename => md5, ... },
#          'sha1sums' => { filename => sha1, ... }
#        }
###
# Do not fear of md5 redundancy :]
# 'files' - info from needed .ini.gz
# 'ini' - unpacked .ini.gz, mentioned above
# 'projects' - projects we plan to sync
# 'md5sums'/'sha1sums' - sums for all available files in repo
###
# Fetches all repository metadata (checksums, patches, per-project
# .ini.gz) over the network and computes the set of packed files /
# deltas that must be downloaded for +projects+.
def get_repo_info(uri, projects)
dprint "get_repo_info(uri='#{uri}', projects='#{projects}')"
result = {}
result['files'] = {}
result['ini'] = {}
result['md5sums'] = {}
result['sha1sums'] = {}
# read all repo info
dprint "get_repo_info: fetching 'MD5SUMS.ini.gz'"
result['ini']['MD5SUMS.ini.gz'] = uri_open(uri + '/' + 'MD5SUMS.ini.gz', "rb"){|f| f.read }
# each line: "<md5> <filename>"
gunzip_string(result['ini']['MD5SUMS.ini.gz']).each_line{|l|
file_md5, file_name = l.split
#dprint "get_repo_info: MD5('#{file_name}') = '#{file_md5}'"
result['md5sums'][file_name] = file_md5.upcase
}
dprint "get_repo_info: fetching 'SHA1SUMS.ini.gz'"
result['ini']['SHA1SUMS.ini.gz'] = uri_open(uri + '/' + 'SHA1SUMS.ini.gz', "rb"){|f| f.read }
# each line: "<sha1> <filename>"
gunzip_string(result['ini']['SHA1SUMS.ini.gz']).each_line{|l|
file_sha1, file_name = l.split
#dprint "get_repo_info: SHA1('#{file_name}') = '#{file_sha1}'"
result['sha1sums'][file_name] = file_sha1.upcase
}
dprint "get_repo_info: fetching 'patches.ini.gz'"
patches_string_gz = uri_open(uri + '/' + 'patches.ini.gz', "rb"){|f| f.read }
result['ini']['patches.ini.gz'] = patches_string_gz
patches_string = gunzip_string(patches_string_gz)
patches = parse_patches_string(patches_string)
# get projects as grep of .ini.gz
# (--all mode: every "<project>.ini.gz" in MD5SUMS gives a project name)
projects = result['md5sums'].keys.select{|e| e =~ /\.ini\.gz$/}.collect{|e| e[0..-8]} if $sync_all
result['projects'] = projects
dprint "get_repo_info: parsing .ini files"
ini_files = {}
projects.sort.each {|p|
# TODO: read partial
dprint "get_repo_info: fetching '#{p + '.ini.gz'}'"
ini_string_gz = uri_open(uri + '/' + p + '.ini.gz', "rb"){|f| f.read }
result['ini'][p + '.ini.gz'] = ini_string_gz
ini_string = gunzip_string(ini_string_gz)
r = parse_ini_string(ini_string)
ini_files = ini_files.merge(r['files'])
}
dprint "get_repo_info: building mapping of 'destination_file' -> [patches]"
patches_to = {}
patches['files'].each{|pair|
p_name, p_info = pair
dest_name = p_name
if i = p_info['patch']
dest_name = i['to']
end
patches_to[dest_name] = [] if patches_to[dest_name].nil?
patches_to[dest_name].push pair
}
dprint "get_repo_info: computing needed deltas"
# search all relevant delta patches
ini_files.each {|f_name, f_info|
# right now for every file in repo we have patches entry
# but when we introduce multistep patching it might stop
# being valid
patches_to[f_name].each{|p_name, p_info|
if (p_name == f_name) or (p_info['patch'] and f_name == p_info['patch']['to'])
result['files'][p_name + '.gz'] = p_info
result['files'][p_name + '.gz']['unpacked'] = f_info['unpacked'] if f_name == p_name # just a compressed file
end
}
}
dprint "get_repo_info: done"
return result
end
#
# extracts meaningful value for to-be-fetched file sorting
#
# Returns the sort key for +file+ according to $sort_by:
# 'ext' -> everything after the first dot ("" when no dot),
# 'name' -> the file name itself, 'size' -> its size from +files+.
def file_value_fn(file, files)
  case $sort_by
  when 'ext'
    _, extension = file.split('.', 2)
    extension || ""
  when 'name'
    file
  when 'size'
    files[file]['size']
  else
    # must not happen: --sort argument is validated at option parsing
    raise "ERROR: unknown sort type: '#{$sort_by}'"
  end
end
# pretty print file size
K = 1024
M = K*K
# Human-readable byte count: "123B", "1.5KB" or "2.0MB".
def pp_size(sz)
  if sz < K
    format("%uB", sz)
  elsif sz < M
    format("%.1fKB", sz.fdiv(K))
  else
    format("%.1fMB", sz.fdiv(M))
  end
end
# Sum the elements of +arr+; 0 for an empty array.
def sum(arr)
  arr.inject(0) { |acc, e| acc + e }
end
# Download every entry of +files+ (name -> {'size','md5','unpacked'})
# from +repo+ into +target+, in the order selected by --sort.
# With --progress, prints a throttled status line to stderr while each
# file downloads and a summary line after each file completes.
def fetch_needed_files(repo, target, files)
fetched_files = 0
files_to_fetch = files.size
fetched_bytes = 0
bytes_to_fetch = sum(files.collect{|f,v| v['size']})
last_banner_length = 0
last_update = 0.0
current_file_size = 0
# invoked by open-uri's :progress_proc with bytes fetched so far
# for the current file; redraws the status line at most every 0.3s
fetch_progress_proc = Proc.new{|fetched_part|
if $show_progress
curr = Time.now.to_f
if last_update + 0.3 < curr # 0.3 seconds passed
last_update = curr
banner = sprintf("STAT: [%u of %u] %s / %s (%.2f%%) | current: %s / %s",
fetched_files, files_to_fetch,
pp_size(fetched_bytes + fetched_part), pp_size(bytes_to_fetch),
(fetched_bytes + fetched_part).to_f * 100.0 / bytes_to_fetch.to_f,
pp_size(fetched_part), pp_size(current_file_size))
# pad with spaces so a shorter banner fully overwrites the previous one
banner += ' ' * [0, last_banner_length - banner.length].max
$stderr.print("\r" + banner)
last_banner_length = [last_banner_length, banner.length].max
end
end
}
files.keys.sort_by{|f| file_value_fn(f, files)}.each{|f|
attr = files[f]
# it is modified/used by :progress_proc of open-uri
last_banner_length = 0
current_file_size = attr['size']
dprint "FNF: f=#{f} md5=#{attr['md5']} s=#{attr['size']} u=#{attr['unpacked']}"
fetch_repo_file(repo, target, f, attr['size'], attr['md5'], attr['unpacked'],
:progress_proc => fetch_progress_proc)
fetched_files += 1
fetched_bytes += attr['size']
# per-file completion summary (newline-terminated, unlike the live banner)
if $show_progress
banner = sprintf("STAT: [%u of %u] %s / %s (%.2f%%) | last: %s",
fetched_files, files_to_fetch,
pp_size(fetched_bytes), pp_size(bytes_to_fetch),
fetched_bytes.to_f * 100.0 / bytes_to_fetch.to_f,
pp_size(current_file_size))
banner += ' ' * [0, last_banner_length - banner.length].max
$stderr.print("\r" + banner + "\n")
end
}
end
# Print "<repo>/<name> <md5>" to stdout for every entry of +files+
# (used by --print-list instead of downloading).
def show_file_list(repo, files)
  files.each do |fname, info|
    print "#{repo}/#{fname} #{info['md5']}\n"
  end
end
# Delete repository-managed files under +target+ that are not listed in
# +files_to_hold+. Only files matching the known repository naming
# patterns are ever considered; anything else is left untouched.
def cleanup_repository(target, files_to_hold)
  managed_patterns = [
    /^[A-F0-9]{32}(\-[A-Z0-9]{32})?(\.bdt)?\.gz$/, # gzipped deltas
    /\.ini\.gz$/,                                  # gzipped inis
    /^[a-f0-9]{8}\.bin$/,                          # old binaries (v1 format)
    /\.ini$/                                       # old inis (v1 format)
  ]
  present = []
  Find.find(target) do |path|
    next if FileTest.directory?(path)
    base = File.basename(path)
    managed_patterns.each { |re| present << base if base =~ re }
  end
  (present - files_to_hold).each do |stale|
    nothrow {
      vprint "DELETE: " + target + '/' + stale
      File.delete(target + '/' + stale)
    }
  end
end
# Entry point. Validates CLI parameters, fetches repository metadata and
# either prints the download list (--print-list) or mirrors: fetch
# binaries, store .ini metadata, clean up stale files (the order that
# keeps the local repository consistent at every step).
# Returns the process exit code: 0 on success, 1 on failure.
# Fix: FileTest.exists? was removed in Ruby 3.2; use exist? instead.
def main(argv)
  begin
    if $show_version
      print "#{my_id}\n"
      return 0
    end
    # --print-list mode takes no target dir: args are <uri> <projects...>
    if $print_list
      uri = argv[0]
      target = nil
      projects = argv[1..-1]
    else
      uri = argv[0]
      target = argv[1]
      projects = argv[2..-1]
    end
    if uri.nil?
      print "ERROR: no uri"
      short_usage
    end
    # target dir is only required (and created) when actually mirroring
    if target.nil?
      print "ERROR: no target"
      short_usage
    else
      Dir.mkdir(target) if !FileTest.exist?(target)
      raise "ERROR: #{target} isn't writable" if !FileTest.writable?(target)
    end if $print_list.nil?
    if projects.nil? and $sync_all.nil?
      print "ERROR: no projects"
      short_usage
    end
    # TODO: check params validity
    repo_info = get_repo_info(uri, projects)
    vprint ""
    vprint "/" + "-"*50
    vprint "| remote repository = #{ uri }"
    vprint "| target dir = #{ target }"
    vprint "| projects to syn = #{ repo_info['projects'].join(' ') }" if !$print_list
    vprint "\\" + "-"*50
    vprint ""
    if $print_list
      show_file_list(uri, repo_info['files'].merge(repo_info['ini']).sort)
    else
      fetch_needed_files(uri, target, repo_info['files'])
      # everything we must NOT delete during cleanup:
      hold_files = repo_info['files'].keys
      hold_files += repo_info['ini'].keys
      #strip .gz part
      hold_files += repo_info['ini'].keys.collect{|e| e[0..-4]} if $compat_v1
      if $compat_v1
        hold_files += repo_info['files'].map{|f, a| a['unpacked']}.compact
      end
      if $prefetch
        vprint "PREFETCH: Done. Keep metadata unmodified."
      else
        # metadata is only switched over after all binaries are in place
        store_inis(target, repo_info['ini'])
        cleanup_repository(target, hold_files)
      end
    end
  rescue => e
    eprint "update failed: #{e}"
    vprint e.backtrace.join "\n"
    return 1
  end
  return 0
end
exit main(ARGV)