hash-files.py (3573B)
#!/usr/bin/env python3
"""Print one SHA-256 digest summarising the paths given on the command line.

Every path is classified as a git repository, a directory, an SQLite 3
database or a plain file, and hashed accordingly (see ``recur``).  The
external tools ``sha256sum``, ``git``, ``sqlite3``, ``file``, ``sh``, ``sort``,
``awk``, ``sed``, ``tr`` and ``uniq`` are invoked through ``subprocess``.
"""

import os
import sys
import subprocess


def debug(s):
    """Trace hook: does nothing unless the ``print`` below is uncommented."""
    #print(s, file=sys.stderr, flush=True)
    pass


def hashFile(filename):
    """Return the SHA-256 of the contents of *filename* as 64 hex digits (bytes).

    Raises ``subprocess.CalledProcessError`` if ``sha256sum`` fails.
    """
    # Only the first 64 bytes (the hex digest) of the sha256sum record are kept;
    # the trailing " *<name>\0" part is dropped.
    result = subprocess.check_output(['sha256sum', '--binary', '--zero', filename])[0:64]
    debug("hashFile("+filename+") = "+str(result))
    return result


def hash1(bytes_):
    """Return the SHA-256 of the byte string *bytes_* as 64 hex digits (bytes).

    The data is fed to ``sha256sum`` on its standard input.
    Raises ``subprocess.CalledProcessError`` if ``sha256sum`` fails.
    """
    result = subprocess.check_output(['sha256sum', '--binary', '--zero'], input=bytes_)[0:64]
    debug("hash1("+str(bytes_)+") = "+str(result))
    return result


#
# TODO: use this to get the hashes and names for all roots of the DAG (commits that are reachable only through one (or several) direct branch names, but not transitively as ancestors of other commits)
#
# Shell pipeline template; {HEAD} and {FETCH_HEAD} are filled in by hashGit()
# with either the ref name or an empty string.  Step by step:
#   1. Every parent hash of every commit is emitted three times (sed 'p;p'
#      plus sed's default print) and every ref tip once; `sort | uniq -u` then
#      keeps only the hashes seen exactly once, i.e. tips that are nobody's
#      parent.
#   2. "<hash> <refname>" lines are appended for HEAD, FETCH_HEAD and every ref.
#   3. After sorting, awk remembers the most recent bare 40-character hash line
#      and prints the "<hash> <refname>" lines whose hash equals it.
#   4. The surviving lines are sorted by ref name.
# The backslash-newline pairs below are consumed by Python (this is not a raw
# string), so the last three lines reach the shell as a single line.
git_command='''
(
  (
    git log --format=%P --all {HEAD} {FETCH_HEAD} | tr ' ' \\n | grep -v '^$' | LC_ALL=C sort -u | sed -e 'p;p';
    git rev-parse {HEAD} {FETCH_HEAD} --all | LC_ALL=C sort -u
  ) | LC_ALL=C sort | uniq -u;
  for ref in {HEAD} {FETCH_HEAD}; do echo "$(git rev-parse $ref) $ref"; done; git for-each-ref --format='%(objectname) %(refname)'
) | LC_ALL=C sort \
  | awk 'BEGIN {{ h="" }} {{ if (length($0) == 40) {{ h=$0 }} else {{ if (substr($0,1,40) == h) print $0 }} }}' \
  | LC_ALL=C sort -k 2
'''


def ref_exists(path, ref):
    """Return True if ``git rev-parse --verify`` accepts *ref* inside *path*.

    *ref* is handed to the shell as the positional parameter ``$1`` instead of
    being spliced into the command text, so it is never interpreted as shell
    syntax.  Going through ``sh`` (rather than executing git directly) keeps a
    missing ``git`` binary a non-zero exit status, which is reported as False.
    """
    try:
        subprocess.check_output(
            ['sh', '-c', 'git rev-parse --verify "$1" 2>/dev/null', '--', ref],
            cwd=path)
    except subprocess.CalledProcessError:
        return False
    return True


def hashGit(path):
    """Return the raw output (bytes) of ``git_command`` run inside *path*.

    HEAD and FETCH_HEAD are only mentioned in the command when they resolve,
    otherwise an empty string is substituted for them.  The value returned is
    the text produced by the pipeline, not a digest; the caller hashes it.
    """
    FETCH_HEAD = "FETCH_HEAD" if ref_exists(path, "FETCH_HEAD") else ''
    HEAD = "HEAD" if ref_exists(path, "HEAD") else ''
    result = subprocess.check_output(['sh', '-c', git_command.format(HEAD=HEAD, FETCH_HEAD=FETCH_HEAD)], cwd=path)
    debug("hashGit("+path+") = "+str(result))
    return result


def hashSqlite3(path):
    """Return the ``sha256sum`` record of the sorted ``.dump`` of database *path*.

    Unlike hashFile() and hash1(), the result is not cut down to the first 64
    bytes: it is the complete record printed by ``sha256sum --binary --zero``
    reading standard input.  This is kept as-is so previously computed digests
    stay valid.
    """
    # NOTE(review): the exit status of a shell pipeline is that of its last
    # command, so a failing `sqlite3` is not detected here -- confirm whether
    # that is acceptable.
    result= subprocess.check_output(['sh', '-c', 'sqlite3 "$1" .dump | LC_ALL=C sort | sha256sum --binary --zero', '--', os.path.abspath(path)])
    debug("hashSqlite3("+path+") = "+str(result))
    return result


def ignore_exitcode(cmd, **kwargs):
    """Run *cmd* via ``subprocess.check_output`` and return its output.

    A non-zero exit status yields ``b''`` instead of raising, so the return
    type is bytes on both paths (unless the caller requests text mode through
    *kwargs*).
    """
    try:
        return subprocess.check_output(cmd, **kwargs)
    except subprocess.CalledProcessError:
        return b''


def is_git(x):
    """Return True if *x* is a directory that git reports as being inside a
    git directory or inside a work tree."""
    if not os.path.isdir(x):
        return False
    # shell=True is used with constant command strings only; it lets the shell
    # discard stderr and turns a missing `git` into a plain non-zero exit.
    if ignore_exitcode("git rev-parse --is-inside-git-dir 2>/dev/null", cwd=x, shell=True).strip() == b'true':
        return True
    return ignore_exitcode("git rev-parse --is-inside-work-tree 2>/dev/null", cwd=x, shell=True).strip() == b'true'
    # TODO: if a path which is inside a larger git dir is passed on the CLI, this still returns True :-(


def _name_bytes(name):
    """Encode a file name for inclusion in the hashed byte stream.

    Names coming from ``sys.argv`` / ``os.listdir`` may carry surrogate escapes
    for bytes that could not be decoded; a strict UTF-8 encode raises
    UnicodeEncodeError on those.  ``surrogateescape`` produces the same bytes
    as a strict encode for every name that a strict encode accepts.
    """
    return name.encode('utf-8', 'surrogateescape')


def recur(depth, x):
    """Return the bytes describing *x*, recursing into directories.

    *x* is either the initial list of paths or a single path:

    * list: ``b'root\\0'`` followed, for each path in sorted order, by its
      recursive result, a space, the path as given and a NUL.  This value is
      not hashed here.
    * git repository: ``hash1`` of a tag plus ``hashGit(x)``.
    * directory: ``hash1`` of a tag plus one "<result> <entry>\\0" record per
      directory entry, in sorted order.
    * SQLite 3 database (according to ``file``): ``hashSqlite3(x)``.
    * regular file: ``hashFile(x)``.

    Anything else terminates the program through ``sys.exit``.  *depth* is only
    used in debug traces.
    """
    # initial list of paths
    if isinstance(x, list):
        debug("ROOT " + str(depth) + ' [' + ', '.join(x) + ']')
        return b'root\0' + b''.join(recur(depth + 1, os.path.abspath(path)) + b' ' + _name_bytes(path) + b'\0' for path in sorted(x))
    # GIT repo
    elif is_git(x):
        debug("GIT DIR " + str(depth) + ' ' + x)
        return hash1(b'git-versioned folder\0' + hashGit(x))
    # directory
    elif os.path.isdir(x):
        debug("DIR " + str(depth) + ' ' + x)
        return hash1(b'directory\0' + b''.join(recur(depth + 1, os.path.join(x, entry)) + b' ' + _name_bytes(entry) + b'\0' for entry in sorted(os.listdir(x))))
    # --brief keeps the file name out of the output, so a name that merely
    # contains the marker text is not mistaken for a database.
    elif b'SQLite 3.x database' in subprocess.check_output(["file", "--brief", "--", x]):
        debug("SQLITE3 " + str(depth) + ' ' + x)
        return hashSqlite3(x)
    # Just a file
    elif os.path.isfile(x):
        debug("PLAIN FILE " + str(depth) + ' ' + x)
        return hashFile(x)
    else:
        sys.exit("unknown file type for %s" % os.path.abspath(x))


def main():
    """Hash the paths given on the command line and print the digest."""
    print(hash1(recur(0, sys.argv[1:])).decode('utf-8'), flush=True)


if __name__ == '__main__':
    main()