#
# Copyright (c) 2016, Alliance for Open Media. All rights reserved
#
# This source code is subject to the terms of the BSD 2 Clause License and
# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
# was not distributed with this source code in the LICENSE file, you can
# obtain it at www.aomedia.org/license/software. If the Alliance for Open
# Media Patent License 1.0 was not distributed with this source code in the
# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
#
# This simple script pulls test files from the webm homepage.
# It downloads a file only if:
# 1) the file / test_data folder does not exist, or
# 2) the local file's SHA does not match the expected value.

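# Example invocation (a sketch; the URL and file names below are
# illustrative placeholders, not values shipped with this script):
#
#   python get_files.py -u <base_url> -i <sha1_file_list> -o <output_dir>
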
import pycurl
import csv
import hashlib
import re
import os.path
import time
import itertools
import sys
import getopt

# globals
url = ''
file_list_path = ''
local_resource_path = ''

31# Helper functions:
32# A simple function which returns the sha hash of a file in hex
33def get_file_sha(filename):
34 try:
35 sha_hash = hashlib.sha1()
36 with open(filename, 'rb') as file:
37 buf = file.read(HASH_CHUNK)
38 while len(buf) > 0:
39 sha_hash.update(buf)
40 buf = file.read(HASH_CHUNK)
41 return sha_hash.hexdigest()
42 except IOError:
43 print "Error reading " + filename
44
# Downloads a file from the given url and then checks its sha against the
# expected value.
def download_and_check_sha(url, filename, sha):
  path = os.path.join(local_resource_path, filename)
  fp = open(path, "wb")
  curl = pycurl.Curl()
  curl.setopt(pycurl.URL, url + "/" + filename)
  curl.setopt(pycurl.WRITEDATA, fp)
  curl.perform()
  curl.close()
  fp.close()
  return get_file_sha(path) == sha

# constants
ftp_retries = 3

SHA_COL = 0
NAME_COL = 1
EXPECTED_COL = 2
HASH_CHUNK = 65536

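# Each row of the file list is expected to hold a SHA-1 hash followed by a
# file name, separated by whitespace (see SHA_COL / NAME_COL above). The row
# below is made up purely for illustration:
#
#   0123456789abcdef0123456789abcdef01234567  example_clip.webm
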
# Main script
try:
  opts, args = getopt.getopt(sys.argv[1:], "u:i:o:",
                             ["url=", "input_csv=", "output_dir="])
except getopt.GetoptError:
  print 'get_files.py -u <url> -i <input_csv> -o <output_dir>'
  sys.exit(2)

for opt, arg in opts:
  if opt in ("-u", "--url"):
    url = arg
  elif opt in ("-i", "--input_csv"):
    file_list_path = os.path.join(arg)
  elif opt in ("-o", "--output_dir"):
    local_resource_path = os.path.join(arg)

if not url or not file_list_path or not local_resource_path:
  print "Expects two paths and a url!"
  sys.exit(1)

if not os.path.isdir(local_resource_path):
  os.makedirs(local_resource_path)

file_list_csv = open(file_list_path, "rb")

# Our 'csv' file uses runs of spaces as its delimiter; Python's csv module
# only supports single-character delimiters, so we collapse them below.
file_list_reader = csv.reader((re.sub(' +', ' ', line) \
                               for line in file_list_csv), delimiter = ' ')

file_shas = []
file_names = []

for row in file_list_reader:
  if len(row) != EXPECTED_COL:
    continue
  file_shas.append(row[SHA_COL])
  file_names.append(row[NAME_COL])

file_list_csv.close()

# Download each file unless it already exists locally with the correct sha.
for filename, sha in itertools.izip(file_names, file_shas):
  path = os.path.join(local_resource_path, filename)
  if os.path.isfile(path) \
      and get_file_sha(path) == sha:
    print path + ' exists, skipping'
    continue
  for retry in range(0, ftp_retries):
    print "Downloading " + path
    if not download_and_check_sha(url, filename, sha):
      print "Sha does not match, retrying..."
    else:
      break