Package pyrocore :: Package util :: Module metafile
[hide private]
[frames | no frames]

Source Code for Module pyrocore.util.metafile

  1  """ Metafile Support. 
  2   
  3      Copyright (c) 2009, 2010, 2011 The PyroScope Project <pyroscope.project@gmail.com> 
  4  """ 
  5  # This program is free software; you can redistribute it and/or modify 
  6  # it under the terms of the GNU General Public License as published by 
  7  # the Free Software Foundation; either version 2 of the License, or 
  8  # (at your option) any later version. 
  9  # 
 10  # This program is distributed in the hope that it will be useful, 
 11  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 13  # GNU General Public License for more details. 
 14  # 
 15  # You should have received a copy of the GNU General Public License along 
 16  # with this program; if not, write to the Free Software Foundation, Inc., 
 17  # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
 18  from __future__ import with_statement 
 19   
 20  import re 
 21  import sys 
 22  import time 
 23  import stat 
 24  import math 
 25  import errno 
 26  import pprint 
 27  import fnmatch 
 28  import hashlib 
 29  import urlparse 
 30  from contextlib import closing 
 31  
 
 32  from pyrobase import bencode 
 33  from pyrobase.parts import Bunch 
 34  from pyrocore import config, error 
 35  from pyrocore.util import os, fmt, pymagic 
 36   
 37   
 38  # Allowed characters in a metafile filename or path 
 39  ALLOWED_NAME = re.compile(r"^[^/\\.~][^/\\]*$") 
 40   
 41  # Character sequences considered secret (roughly, any path part or query parameter 
 42  # that looks like an alphanumeric sequence or url-safe base64 string) 
 43  PASSKEY_RE = re.compile(r"(?<=[/=])[-_0-9a-zA-Z]{5,64}={0,3}(?=[/&]|$)") 
 44   
 45  # Non-secret exemptions 
 46  PASSKEY_OK = ("announce", "TrackerServlet",) 
 47   
 48  # List of all standard keys in a metafile 
 49  METAFILE_STD_KEYS = [i.split('.') for i in ( 
 50      "announce", 
 51      "comment", 
 52      "created by", 
 53      "creation date", 
 54      "encoding", 
 55      "info", 
 56      "info.length", 
 57      "info.name", 
 58      "info.piece length", 
 59      "info.pieces", 
 60      "info.private", 
 61      "info.files", 
 62      "info.files.length", 
 63      "info.files.path", 
 64  )] 
 65   
 66  del i 
 67   
 68   
def console_progress():
    """ Build a progress callback for interactive consoles.

        @return: A callable taking C{(totalhashed, totalsize)} that writes the
            current state to stdout, or C{None} if stdout is not a tty (or has
            no C{isatty} method at all).
    """
    def progress(totalhashed, totalsize):
        "Overwrite the current console line with the hashing state."
        if totalhashed < totalsize:
            msg = "%5.1f%% complete" % (totalhashed * 100.0 / totalsize)
        else:
            # Nothing left to report, overwrite the last percentage with blanks
            msg = " " * 30

        # The carriage return makes the next message start at column 0 again
        sys.stdout.write(msg + " \r")
        sys.stdout.flush()

    try:
        interactive = sys.stdout.isatty()
    except AttributeError:
        # stdout got replaced by an object that is not a full file
        return None

    if interactive:
        return progress
    return None
def mask_keys(announce_url):
    """ Hide anything in an announce URL that looks like a passkey.

        Every match of C{PASSKEY_RE} is replaced by the same number of
        asterisks, unless the matched text is listed in C{PASSKEY_OK}.

        @param announce_url: The URL to mask.
        @return: The masked URL.
    """
    def censor(match):
        "Return the replacement text for one match."
        candidate = match.group()
        if candidate in PASSKEY_OK:
            return candidate
        return "*" * len(candidate)

    return PASSKEY_RE.sub(censor, announce_url)
92 93
class MaskingPrettyPrinter(pprint.PrettyPrinter):
    """ A PrettyPrinter variant that hides passkeys in the URLs it prints.
    """

    def format(self, obj, context, maxlevels, level):
        """ Format C{obj} like the base class does, but pass any string
            containing "://" through C{mask_keys} first.
        """
        looks_like_url = isinstance(obj, basestring) and "://" in obj
        shown = mask_keys(obj) if looks_like_url else obj
        return pprint.PrettyPrinter.format(self, shown, context, maxlevels, level)
104 105
def check_info(info):
    """ Validate info dict.

        @param info: The C{info} part of a metafile.
        @return: The unchanged C{info}, if it passed all checks.
        @raise ValueError: On the first check that fails.
    """
    if not isinstance(info, dict):
        raise ValueError("bad metainfo - not a dictionary")

    # The piece hashes are stored as one string, 20 characters per piece
    pieces = info.get("pieces")
    if not isinstance(pieces, basestring) or len(pieces) % 20 != 0:
        raise ValueError("bad metainfo - bad pieces key")

    piece_size = info.get("piece length")
    if not isinstance(piece_size, (int, long)) or piece_size <= 0:
        raise ValueError("bad metainfo - illegal piece length")

    name = info.get("name")
    if not isinstance(name, basestring):
        raise ValueError("bad metainfo - bad name (type is %r)" % type(name).__name__)
    if not ALLOWED_NAME.match(name):
        raise ValueError("name %s disallowed for security reasons" % name)

    # Exactly one of "length" (single file) and "files" (file list) must be there
    if ("files" in info) == ("length" in info):
        raise ValueError("single/multiple file mix")

    if "length" in info:
        length = info.get("length")
        if not isinstance(length, (int, long)) or length < 0:
            raise ValueError("bad metainfo - bad length")
    else:
        files = info.get("files")
        if not isinstance(files, (list, tuple)):
            raise ValueError("bad metainfo - bad file list")

        for item in files:
            if not isinstance(item, dict):
                raise ValueError("bad metainfo - bad file value")

            length = item.get("length")
            if not isinstance(length, (int, long)) or length < 0:
                raise ValueError("bad metainfo - bad length")

            path = item.get("path")
            if not isinstance(path, (list, tuple)) or not path:
                raise ValueError("bad metainfo - bad path")

            # Every path component has to pass the same check as the name
            for part in path:
                if not isinstance(part, basestring):
                    raise ValueError("bad metainfo - bad path dir")
                if not ALLOWED_NAME.match(part):
                    raise ValueError("path %s disallowed for security reasons" % part)

        # No two entries may describe the same file
        file_paths = [os.sep.join(item["path"]) for item in files]
        if len(set(file_paths)) != len(file_paths):
            raise ValueError("bad metainfo - duplicate path")

    return info
163 164
def check_meta(meta):
    """ Validate a complete metafile dict.

        @param meta: The metafile dict to check.
        @return: The unchanged C{meta}, if it is valid.
        @raise ValueError: If C{meta} is no dict, if its announce URL is no
            string, or if C{check_info} rejects its info dict.
    """
    if isinstance(meta, dict):
        announce = meta.get("announce")
        if isinstance(announce, basestring):
            check_info(meta.get("info"))
            return meta
        raise ValueError("bad announce URL - not a string")

    raise ValueError("bad metadata - not a dictionary")
177 178
def clean_meta(meta, including_info=False, logger=None):
    """ Clean meta dict. Optionally log changes using the given logger.

        Any key not listed in C{METAFILE_STD_KEYS} is removed, in place.

        @param meta: The metafile dict to clean.
        @param including_info: If true, also clean the C{info} dict and the
            entries of its C{files} list (if there is one).
        @param logger: If given, a callable accepting a string message.
        @return: Set of keys removed from C{meta}.
    """
    modified = set()

    # All loops run over a snapshot of the keys, since removing entries
    # from a dict while iterating over it directly is not allowed
    for key in list(meta.keys()):
        if [key] not in METAFILE_STD_KEYS:
            if logger:
                logger("Removing key %r..." % (key,))
            del meta[key]
            modified.add(key)

    if including_info:
        for key in list(meta["info"].keys()):
            if ["info", key] not in METAFILE_STD_KEYS:
                if logger:
                    logger("Removing key %r..." % ("info." + key,))
                del meta["info"][key]
                modified.add("info." + key)

        for idx, entry in enumerate(meta["info"].get("files", [])):
            for key in list(entry.keys()):
                if ["info", "files", key] not in METAFILE_STD_KEYS:
                    if logger:
                        logger("Removing key %r from file #%d..." % (key, idx + 1))
                    del entry[key]
                    modified.add("info.files." + key)

    return modified
211 212
def sanitize(meta):
    """ Try to fix common problems, especially transcode non-standard string encodings.

        The C{comment}, C{created by} and C{info.name} fields, and all path
        components in C{info.files}, are replaced by their UTF-8 encoded form.

        @param meta: Metafile dict holding byte strings, changed in place.
        @return: The same C{meta} object.
    """
    def sane_encoding(text):
        "Transcoding helper."
        # Candidates in order of preference; the declared encoding may be missing
        for encoding in ('utf-8', meta.get('encoding', None), 'cp1252'):
            if not encoding:
                continue
            try:
                return text.decode(encoding).encode("utf-8")
            except (UnicodeError, LookupError, TypeError):
                # Either the text is not in that encoding, or the "encoding"
                # field holds an unknown codec name or something that is no
                # name at all; in any case, try the next candidate
                continue

        # Broken beyond anything reasonable
        return text.decode('utf-8', 'replace').replace(u'\ufffd', '_').encode("utf-8")

    # Go through all string fields and check them
    for field in ("comment", "created by"):
        if field in meta:
            meta[field] = sane_encoding(meta[field])

    meta["info"]["name"] = sane_encoding(meta["info"]["name"])

    for entry in meta["info"].get("files", []):
        entry["path"] = [sane_encoding(part) for part in entry["path"]]

    return meta
def assign_fields(meta, assignments):
    """ Takes a list of C{key=value} strings and
        assigns them to the given metafile.

        If just a key name is given (no '='), the field is removed.

        Dots in a key name address nested dicts, which are created when
        missing. A value made of a sign ("+" or "-") followed by nothing but
        digits is stored as an integer, any other value as a string.

        @param meta: The metafile dict, changed in place.
        @param assignments: Iterable of assignment strings.
        @return: The same C{meta} object.
        @raise error.UserError: If an assignment cannot be parsed, or if its
            key leads through a value that is not a dict.
    """
    for assignment in assignments:
        try:
            if '=' in assignment:
                field, val = assignment.split('=', 1)
            else:
                field, val = assignment, None

            if val and val[0] in "+-" and val[1:].isdigit():
                val = int(val, 10)

            # TODO: Allow numerical indices, and "+" for append
            keypath = field.split('.')
            namespace = meta
            for key in keypath[:-1]:
                # Create missing dicts as we go...
                namespace = namespace.setdefault(key, {})
        except (AttributeError, KeyError, IndexError, TypeError, ValueError):
            # AttributeError is what an existing non-dict value (which has
            # no "setdefault") raises when the key tries to pass through it;
            # the exception is fetched this way to work on any Python version
            exc = sys.exc_info()[1]
            raise error.UserError("Bad assignment %r (%s)!" % (assignment, exc))
        else:
            if val is None:
                del namespace[keypath[-1]]
            else:
                namespace[keypath[-1]] = val

    return meta
271 272
def add_fast_resume(meta, datapath):
    """ Add fast resume data to a metafile dict.

        @param meta: The metafile dict, which gets a C{libtorrent_resume} entry.
        @param datapath: Path of the data; for a metafile without a file list,
            this may also be the directory containing the named file.
        @return: The same C{meta} object.
        @raise OSError: If the size of a file on disk differs from the
            length recorded for it.
    """
    info = meta["info"]

    # Without a file list, treat the one data file as a list of length one
    files = info.get("files", None)
    single = files is None
    if single:
        if os.path.isdir(datapath):
            datapath = os.path.join(datapath, info["name"])
        files = [Bunch(
            path=[os.path.abspath(datapath)],
            length=info["length"],
        )]

    # Set up the resume structure
    resume = meta.setdefault("libtorrent_resume", {})
    resume["bitfield"] = len(info["pieces"]) // 20
    resume["files"] = []
    piece_length = info["piece length"]
    offset = 0

    for fileinfo in files:
        expected = fileinfo["length"]

        # Map the entry to a path in the filesystem
        filepath = os.sep.join(fileinfo["path"])
        if not single:
            filepath = os.path.join(datapath, filepath)

        # Refuse data that does not have the recorded size
        actual = os.path.getsize(filepath)
        if actual != expected:
            raise OSError(errno.EINVAL, "File size mismatch for %r [is %d, expected %d]" % (
                filepath, actual, expected,
            ))

        # Number of pieces that overlap the byte range of this file
        first_piece = offset // piece_length
        end_piece = (offset + expected + piece_length - 1) // piece_length
        resume["files"].append(dict(
            priority=1,
            mtime=int(os.path.getmtime(filepath)),
            completed=end_piece - first_piece,
        ))
        offset += expected

    return meta
316 317
def info_hash(metadata):
    """ Return the SHA1 digest of the bencoded C{info} dict, as an
        upper-case hexadecimal string.
    """
    encoded_info = bencode.bencode(metadata['info'])
    digest = hashlib.sha1(encoded_info)
    return digest.hexdigest().upper()
322 323
def data_size(metadata):
    """ Calculate the size of a torrent based on parsed metadata.

        @param metadata: Metafile dict with an C{info} entry.
        @return: The C{length} of the info dict if it has one (single file),
            else the sum of the lengths of all entries in its C{files} list.
    """
    info = metadata['info']

    if 'length' in info:
        # Single file
        return info['length']

    # Directory structure
    return sum(entry['length'] for entry in info['files'])
337 338
class Metafile(object):
    """ A torrent metafile.

        An object is bound to the path of a metafile, and optionally to the
        path of the data it describes. That data path can be a single file,
        a directory tree, or a FIFO that delivers the names of the files to
        hash (one per line, relative to the directory holding the FIFO).
    """

    # Patterns of names to ignore
    IGNORE_GLOB = [
        "core", "CVS", ".*", "*~", "*.swp", "*.tmp", "*.bak",
        "[Tt]humbs.db", "[Dd]esktop.ini", "ehthumbs_vista.db",
    ]


    def __init__(self, filename, datapath=None):
        """ Initialize metafile.

            @param filename: Path of the metafile.
            @param datapath: Optional path of the data described by it.
        """
        self.filename = filename
        self.progress = None
        self.datapath = datapath
        self.ignore = self.IGNORE_GLOB[:]
        self.LOG = pymagic.get_class_logger(self)


    def _get_datapath(self):
        """ Get a valid datapath, else raise an exception.

            @raise OSError: If no data path was set.
        """
        if self._datapath is None:
            raise OSError(errno.ENOENT, "You didn't provide any datapath for %r" % self.filename)

        return self._datapath

    def _set_datapath(self, datapath):
        """ Set a datapath.

            Trailing path separators are removed, and the path is checked
            for being a FIFO. An empty value resets the data path.
        """
        if datapath:
            self._datapath = datapath.rstrip(os.sep)
            # 1 for a FIFO that was not read yet ("walk" sets this to 2), else 0
            self._fifo = int(stat.S_ISFIFO(os.stat(self.datapath).st_mode))
        else:
            self._datapath = None
            self._fifo = False

    datapath = property(_get_datapath, _set_datapath)


    def _is_ignored(self, name):
        """ Check whether a file or directory name matches any ignore pattern.
        """
        return any(fnmatch.fnmatch(name, pattern) for pattern in self.ignore)


    def walk(self):
        """ Generate paths in "self.datapath".

            For a FIFO, the names read from it are joined to the directory
            containing the FIFO, without any filtering. For a directory, all
            files in the tree are generated, except those that have an ignored
            name or are located below a directory with an ignored name. Any
            other path is generated as is.

            @raise RuntimeError: If a FIFO is walked for a second time.
        """
        # FIFO?
        if self._fifo:
            if self._fifo > 1:
                raise RuntimeError("INTERNAL ERROR: FIFO read twice!")
            self._fifo += 1

            # Read paths relative to directory containing the FIFO
            with closing(open(self.datapath, "r")) as fifo:
                while True:
                    relpath = fifo.readline().rstrip('\n')
                    if not relpath: # EOF?
                        break
                    self.LOG.debug("Read relative path %r from FIFO..." % (relpath,))
                    yield os.path.join(os.path.dirname(self.datapath), relpath)

            self.LOG.debug("FIFO %r closed!" % (self.datapath,))

        # Directory?
        elif os.path.isdir(self.datapath):
            # Walk the directory tree
            for dirpath, dirnames, filenames in os.walk(self.datapath):
                # Don't scan blacklisted directories (the list has to be
                # changed in place to keep "os.walk" from descending into them)
                dirnames[:] = [name for name in dirnames if not self._is_ignored(name)]

                # Yield all filenames that aren't blacklisted
                for filename in filenames:
                    if not self._is_ignored(filename):
                        yield os.path.join(dirpath, filename)

        # Single file
        else:
            # Yield the filename
            yield self.datapath


    def _calc_size(self):
        """ Get total size of "self.datapath".
        """
        return sum(os.path.getsize(filename) for filename in self.walk())


    def _make_info(self, piece_size, progress, walker, piece_callback=None):
        """ Create info dict.

            @param piece_size: Number of bytes per piece.
            @param progress: Optional callable that is passed the number of
                bytes hashed so far, and the total size (-1 for a FIFO).
            @param walker: Iterable of the paths to hash, in hashing order.
            @param piece_callback: Optional callable that is passed the name of
                the file a piece ended in, and the digest of that piece.
            @return: The info dict, after validation by C{check_info}.
            @raise OSError: If a file ends before its expected size was read.
        """
        # These collect the file descriptions and piece hashes
        file_list = []
        pieces = []

        # Initialize progress state
        totalsize = -1 if self._fifo else self._calc_size()
        totalhashed = 0

        # Start a new piece
        sha1 = hashlib.sha1()
        done = 0

        # Paths in the file list are relative to this directory
        basedir = os.path.dirname(self.datapath) if self._fifo else self.datapath

        # Hash all files
        for filename in walker:
            # Assemble file info
            filesize = os.path.getsize(filename)
            filepath = filename[len(basedir):].lstrip(os.sep)
            file_list.append({
                "length": filesize,
                "path": filepath.replace(os.sep, '/').split('/'),
            })
            self.LOG.debug("Hashing %r, size %d..." % (filename, filesize))

            # Open file and hash it
            fileoffset = 0
            handle = open(filename, "rb")
            try:
                while fileoffset < filesize:
                    # Read rest of piece or file, whatever is smaller
                    chunk = handle.read(min(filesize - fileoffset, piece_size - done))
                    if not chunk:
                        # The file is shorter than the size determined above;
                        # going on would never advance the offset again
                        raise OSError(errno.EIO, "Unexpected end of file %r at offset %d of %d" % (
                            filename, fileoffset, filesize,
                        ))
                    sha1.update(chunk)
                    done += len(chunk)
                    fileoffset += len(chunk)
                    totalhashed += len(chunk)

                    # Piece is done
                    if done == piece_size:
                        pieces.append(sha1.digest())
                        if piece_callback:
                            piece_callback(filename, pieces[-1])

                        # Start a new piece
                        sha1 = hashlib.sha1()
                        done = 0

                    # Report progress
                    if progress:
                        progress(totalhashed, totalsize)
            finally:
                handle.close()

        # Add hash of partial last piece
        if done > 0:
            pieces.append(sha1.digest())
            if piece_callback:
                piece_callback(filename, pieces[-1])

        # Build the meta dict
        metainfo = {
            "pieces": "".join(pieces),
            "piece length": piece_size,
            "name": os.path.basename(self.datapath),
        }

        # Handle directory/FIFO vs. single file
        if self._fifo or os.path.isdir(self.datapath):
            metainfo["files"] = file_list
        else:
            metainfo["length"] = totalhashed

        # Return validated info dict
        return check_info(metainfo)


    def _make_meta(self, tracker_url, root_name, private, progress):
        """ Create torrent dict.

            @param tracker_url: The announce URL.
            @param root_name: If set, replaces the default name (the
                basename of the data path).
            @param private: If true, the private flag is set.
            @param progress: Optional progress callable, see C{_make_info}.
            @return: The metafile dict, after validation by C{check_meta}.
        """
        # Calculate piece size
        if self._fifo:
            # TODO we need to add a (command line) param, probably for total data size
            # for now, always 1MB
            piece_size_exp = 20
        else:
            total_size = self._calc_size()
            if total_size:
                piece_size_exp = int(math.log(total_size) / math.log(2)) - 9
            else:
                piece_size_exp = 0

        # Keep the piece size between 2**15 and 2**24 bytes
        piece_size_exp = min(max(15, piece_size_exp), 24)
        piece_size = 2 ** piece_size_exp

        # Build info hash (names coming from a FIFO keep their order)
        walker = self.walk() if self._fifo else sorted(self.walk())
        info = self._make_info(piece_size, progress, walker)

        # Enforce unique hash per tracker
        info["x_cross_seed"] = hashlib.md5(tracker_url).hexdigest()

        # Set private flag
        if private:
            info["private"] = 1

        # Freely chosen root name (default is basename of the data path)
        if root_name:
            info["name"] = root_name

        # Torrent metadata
        meta = {
            "info": info,
            "announce": tracker_url.strip(),
        }

        #XXX meta["encoding"] = "UTF-8"

        # Return validated meta dict
        return check_meta(meta)


    def create(self, datapath, tracker_urls, comment=None, root_name=None,
               created_by=None, private=False, no_date=False, progress=None,
               callback=None):
        """ Create a metafile with the path given on object creation.
            Returns the last metafile dict that was written (as an object, not bencoded).

            @param datapath: If set, replaces the current data path.
            @param tracker_urls: One announce URL or alias, or a sequence of
                them. With more than one, a metafile is written for each, and
                the tracker's alias becomes part of its filename.
            @param comment: Optional value for the C{comment} field.
            @param root_name: Optional replacement for the name in the info dict.
            @param created_by: Optional value for the C{created by} field.
            @param private: If true, the private flag is set.
            @param no_date: If true, no C{creation date} is added.
            @param progress: Optional progress callable, see C{_make_info}.
            @param callback: Optional callable that is passed each metafile
                dict before it is written.
            @raise error.UserError: For a tracker that cannot be resolved.
        """
        if datapath:
            self.datapath = datapath

        # Only a string can be added to '' without raising a TypeError,
        # anything else is taken to be a sequence of strings
        try:
            tracker_urls = ['' + tracker_urls]
        except TypeError:
            tracker_urls = list(tracker_urls)
        multi_mode = len(tracker_urls) > 1

        # TODO add optimization so the hashing happens only once for multiple URLs!
        for tracker_url in tracker_urls:
            # Lookup announce URLs from config file
            try:
                parsed_url = urlparse.urlparse(tracker_url)
                if parsed_url.scheme:
                    # For a real URL, the alias is the 2nd level of its
                    # domain (or the plain host name, if there are no dots)
                    host_parts = parsed_url.netloc.split(':')[0].split('.')
                    tracker_alias = host_parts[-2 if len(host_parts) > 1 else 0]
                else:
                    tracker_alias, tracker_url = config.lookup_announce_alias(tracker_url)
                    tracker_url = tracker_url[0]
            except (KeyError, IndexError):
                raise error.UserError("Bad tracker URL %r, or unknown alias!" % (tracker_url,))

            # Determine metafile name
            output_name = self.filename
            if multi_mode:
                # Add 2nd level of announce URL domain to metafile name
                stem, extension = os.path.splitext(output_name)
                output_name = stem + '-' + tracker_alias + extension

            # Hash the data
            self.LOG.info("Creating %r for %s %r..." % (
                output_name, "filenames read from" if self._fifo else "data in", self.datapath,
            ))
            meta = self._make_meta(tracker_url, root_name, private, progress)

            # Add optional fields
            if comment:
                meta["comment"] = comment
            if created_by:
                meta["created by"] = created_by
            if not no_date:
                meta["creation date"] = long(time.time())
            if callback:
                callback(meta)

            # Write metafile to disk
            self.LOG.debug("Writing %r..." % (output_name,))
            bencode.bwrite(output_name, meta)

        return meta


    def check(self, metainfo, datapath, progress=None):
        """ Check piece hashes of a metafile against the given datapath.

            @param metainfo: The metafile dict to check against.
            @param datapath: If set, replaces the current data path.
            @param progress: Optional progress callable, see C{_make_info}.
            @return: True if the hashes of the data match the metafile.
            @raise OSError: If there is no data path at all.
        """
        if datapath:
            self.datapath = datapath

        # Use the stored path from here on, so that an empty "datapath"
        # argument falls back to a path that was set before
        datapath = self.datapath
        info = metainfo["info"]

        def check_piece(filename, piece):
            "Callback for new piece"
            start = check_piece.piece_index
            if piece != info["pieces"][start:start+20]:
                self.LOG.warn("Piece #%d: Hashes differ in file %r" % (start//20, filename))
            check_piece.piece_index += 20

        # Offset of the next piece hash in the metafile's hash string
        check_piece.piece_index = 0

        # Hash the files in the order the metafile lists them
        if "length" in info:
            walker = [datapath]
        else:
            walker = (os.path.join(datapath, *entry["path"]) for entry in info["files"])

        datameta = self._make_info(int(info["piece length"]), progress, walker,
            piece_callback=check_piece)
        return datameta["pieces"] == info["pieces"]


    def listing(self, masked=True):
        """ List torrent info & contents. Returns a list of formatted lines.

            @param masked: If true, passkeys in the announce URL are masked.
        """
        # Assemble data
        metainfo = sanitize(bencode.bread(self.filename))
        announce = metainfo['announce']
        info = metainfo['info']
        info_digest = hashlib.sha1(bencode.bencode(info))

        total_size = data_size(metainfo)
        piece_length = info['piece length']
        piece_number, last_piece_length = divmod(total_size, piece_length)

        # Build result
        # NOTE(review): this code was recovered from a rendering that seems to
        # collapse runs of blanks, so the padding within the labels below may
        # have been wider originally -- verify against the pristine source
        result = [
            "NAME %s" % (os.path.basename(self.filename)),
            "SIZE %s (%i * %s + %s)" % (
                fmt.human_size(total_size).strip(),
                piece_number, fmt.human_size(piece_length).strip(),
                fmt.human_size(last_piece_length).strip(),
            ),
            "HASH %s" % (info_digest.hexdigest().upper()),
            "URL %s" % (mask_keys if masked else str)(announce),
            "PRV %s" % ("YES (DHT/PEX disabled)" if info.get("private") else "NO (DHT/PEX enabled)"),
            "TIME %s" % ("N/A" if "creation date" not in metainfo else
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(metainfo["creation date"]))
            ),
        ]

        for label, key in (("BY ", "created by"), ("REM ", "comment")):
            if key in metainfo:
                result.append("%s %s" % (label, metainfo.get(key, "N/A")))

        result.extend([
            "",
            "FILE LISTING",
        ])
        if 'length' in info:
            # Single file
            result.append("%-69s%9s" % (
                info['name'],
                fmt.human_size(total_size),
            ))
        else:
            # Directory structure
            result.append("%s/" % info['name'])
            shown = []  # directory components of the previous entry
            for entry in info['files']:
                path = entry['path']
                folders = path[:-1]

                # Directories shared with the previous entry are on display
                # already; print everything from the first difference on, so
                # that equally named directories below different parents
                # each get their own line
                common = 0
                while (common < len(folders) and common < len(shown)
                        and folders[common] == shown[common]):
                    common += 1
                for idx in range(common, len(folders)):
                    result.append("%s%s/" % (' ' * (4*(idx+1)), folders[idx]))
                shown = folders

                result.append("%-69s%9s" % (
                    ' ' * (4*len(path)) + path[-1],
                    fmt.human_size(entry['length']),
                ))

        return result
693