Mercurial > if-archive-json-mirror
annotate generate_index.py @ 2:27b872aee1a7
Added mirror.py.
author | Atul Varma <varmaa@toolness.com> |
---|---|
date | Tue, 27 May 2008 23:45:16 -0700 |
parents | 30b149f2cdf1 |
children |
rev | line source |
---|---|
0
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
1 import elementtree.ElementTree as ET |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
2 import urllib2 |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
3 import re |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
4 import os |
2 | 5 import cPickle |
0
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
6 |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
7 import json |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
8 |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
9 ZCODE_REGEXP = r".*\.z([1-8]|blorb)$" |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
10 EXCLUDED_DIRS = ["if-archive/infocom", |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
11 "if-archive/solutions", |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
12 "if-archive/starters"] |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
13 EXCLUDED_DIRS_REGEXP = "|".join(EXCLUDED_DIRS) |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
14 INDEX_URL = "http://www.ifarchive.org/indexes/Master-Index.xml" |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
15 XML_FILENAME = "Master-Index.xml" |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
16 JSON_FILENAME = "if-archive.js" |
2 | 17 PICKLE_FILENAME = "if-archive.pickle" |
0
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
18 |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
19 if __name__ == "__main__": |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
20 if not os.path.exists(XML_FILENAME): |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
21 print "Fetching %s." % INDEX_URL |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
22 |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
23 data = urllib2.urlopen(INDEX_URL).read() |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
24 open(XML_FILENAME, "w").write(data) |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
25 |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
26 print "Scanning files." |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
27 |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
28 page = ET.ElementTree(file=XML_FILENAME) |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
29 files = page.findall("file") |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
30 zfiles = [ |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
31 filenode for filenode in files |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
32 if re.match(ZCODE_REGEXP, filenode.find("name").text) and |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
33 filenode.find("description") is not None and |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
34 not re.match(EXCLUDED_DIRS_REGEXP, filenode.find("path").text) |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
35 ] |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
36 |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
37 dicts = [] |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
38 |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
39 print "Writing %s" % JSON_FILENAME |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
40 |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
41 for filenode in zfiles: |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
42 desc = filenode.find("description").text.strip() |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
43 if desc[-1] == ")": |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
44 desc = desc[:desc.rindex("(")] |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
45 elif desc[-1] == "]": |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
46 desc = desc[:desc.rindex("[")] |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
47 dicts.append( |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
48 {"path" : filenode.find("path").text, |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
49 "desc" : desc.encode("utf-8")} |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
50 ) |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
51 |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
52 def getdesc(a): |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
53 return a["desc"] |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
54 |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
55 dicts.sort(key = getdesc) |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
56 |
30b149f2cdf1
Origination, copied over from http://code.google.com/p/parchment/ revision 32.
Atul Varma <varmaa@toolness.com>
parents:
diff
changeset
|
57 open(JSON_FILENAME, "w").write("stories = " + json.write(dicts)) |
2 | 58 |
59 print "Writing %s" % PICKLE_FILENAME | |
60 cPickle.dump(dicts, open(PICKLE_FILENAME, "w")) |