From 0308e0260891159526480c952ee1afbda5479e0b Mon Sep 17 00:00:00 2001
From: David Trail
Date: Fri, 16 Mar 2012 16:42:27 +0100
Subject: [PATCH] Initial obtaining of user data.

---
 .gitignore  |  1 +
 README.rst  |  6 +++++-
 shreddit.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 1 deletion(-)
 create mode 100644 .gitignore
 create mode 100755 shreddit.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2d46485
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+data.json
diff --git a/README.rst b/README.rst
index bc6b11a..3ba56d9 100644
--- a/README.rst
+++ b/README.rst
@@ -6,8 +6,12 @@ Details
 When one deletes their account on Reddit it does nothing with their comment
 history other than obscure the author (replaces with [deleted]) which is not
 good enough for some people.
 
+Usage
+-----------
+python2 shreddit.py UserName
+
 Caveats
-----------
+-----------
 - Only your previous 1,000 comments are accessible on Reddit. So good luck deleting the others. There may be ways to hack around this via iterating using sorting by top/best/controversial/new but for now I am unsure.
 - Would make life easier if Reddit just did a "DELETE * FROM abc_def WHERE user_id = 1337"
diff --git a/shreddit.py b/shreddit.py
new file mode 100755
index 0000000..6392c34
--- /dev/null
+++ b/shreddit.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python2
+
+import sys
+from json import loads, dumps
+from urllib2 import urlopen
+from time import sleep
+
+if len(sys.argv) != 2:
+    raise Exception("You must specify a user")
+
+user = sys.argv[1]
+
+sub_section = 'comments'
+after = ''
+
+init_url = 'http://www.reddit.com/user/{user}/{section}/.json?after=%s'.format(user=user, section=sub_section)
+next_url = init_url % after
+
+http = urlopen(next_url).read()
+json = loads(http)
+
+datum = []
+while True:
+    print "Grabbing IDs for after: ", after
+    after = json['data']['after']
+    children = json['data']['children']
+
+    # This bit fills datum with the id (for removal) and the date (for saving recent posts)
+    for child in children:
+        child_data = child['data']
+        if 'id' in child_data:
+            datum.append({'id': child_data['id'], 'date': child_data['created_utc']})
+
+    if after is None:
+        break
+
+    next_url = init_url % after
+    http = urlopen(next_url).read()
+    json = loads(http)
+    sleep(2)  # don't want to hammer reddit too hard
+
+print "Script collected all available data."
+
+f = open('data.json', 'w')
+f.write(dumps(datum))
+f.close()
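
The script stores each comment's created_utc alongside its id; the inline comment notes the date is kept "for saving recent posts". A minimal sketch of how the resulting data.json could later be split into recent comments to keep and older ones to remove (Python 3 for illustration; the 30-day cut-off and the keep/remove split are assumptions, not part of this patch)::

    import json
    import time

    KEEP_DAYS = 30  # assumed cut-off; the patch itself does not define one

    # data.json is the list of {'id': ..., 'date': created_utc} dicts written by shreddit.py
    with open('data.json') as fh:
        datum = json.load(fh)

    cutoff = time.time() - KEEP_DAYS * 24 * 60 * 60
    recent = [d for d in datum if d['date'] >= cutoff]  # keep these
    old = [d for d in datum if d['date'] < cutoff]      # candidates for removal

    print('%d recent, %d older comment IDs' % (len(recent), len(old)))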
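
The README caveat about the 1,000-comment listing limit mentions iterating over the different sortings as a possible workaround. A speculative sketch of that idea (untested, as the README itself says; whether the extra sort orders actually surface additional comments is not verified by this patch, and the user name is a placeholder)::

    import json
    import time
    from urllib.request import Request, urlopen

    user = 'UserName'  # placeholder, as in the README usage line
    base = 'https://www.reddit.com/user/{user}/comments/.json?sort={sort}&after={after}'

    seen = set()
    for sort in ('new', 'top', 'controversial'):  # sort values assumed to be accepted by the listing
        after = ''
        while True:
            url = base.format(user=user, sort=sort, after=after)
            req = Request(url, headers={'User-Agent': 'shreddit-sketch'})
            listing = json.loads(urlopen(req).read())
            for child in listing['data']['children']:
                seen.add(child['data']['id'])
            after = listing['data']['after']
            if after is None:
                break
            time.sleep(2)  # same politeness delay as shreddit.py

    print('%d unique comment IDs collected' % len(seen))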