From 83fd3510cd5cd374a8df51b53b1542d2b9086d41 Mon Sep 17 00:00:00 2001 From: Scott Date: Mon, 18 Jul 2016 04:36:24 -0500 Subject: [PATCH] Split up shredder into a class. Added support for >1000 items --- requirements.txt | 1 + shreddit.yml.example | 4 + shreddit/app.py | 6 +- shreddit/shredder.py | 296 ++++++++++++++++++++++++------------------- 4 files changed, 174 insertions(+), 133 deletions(-) diff --git a/requirements.txt b/requirements.txt index 1f3ff5c..560f4fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ six==1.10.0 tornado==4.3 update-checker==0.11 wheel==0.24.0 +arrow diff --git a/shreddit.yml.example b/shreddit.yml.example index f23bf0d..96365ad 100644 --- a/shreddit.yml.example +++ b/shreddit.yml.example @@ -68,4 +68,8 @@ save_directory: /tmp # options: [random, dot, "user entered string"] replacement_format: random +# Batch cooldown +# This controls how long (in seconds) to wait between each set of 1000 deletions. +batch_cooldown: 10 + # vim: syntax=yaml ts=2 diff --git a/shreddit/app.py b/shreddit/app.py index e00e17c..3ada978 100644 --- a/shreddit/app.py +++ b/shreddit/app.py @@ -2,8 +2,9 @@ """ import argparse import yaml +import logging from shreddit.oauth import oauth_test -from shreddit.shredder import shred +from shreddit.shredder import Shredder def main(): @@ -22,7 +23,8 @@ def main(): if not config: raise Exception("No config options passed!") - shred(config, args.praw) + shredder = Shredder(config, args.praw) + shredder.shred() if __name__ == "__main__": diff --git a/shreddit/shredder.py b/shreddit/shredder.py index 4408e70..242c083 100644 --- a/shreddit/shredder.py +++ b/shreddit/shredder.py @@ -3,8 +3,10 @@ import sys import logging import argparse import json +import arrow import yaml import praw +import time from re import sub from datetime import datetime, timedelta from praw.errors import (InvalidUser, InvalidUserPass, RateLimitExceeded, HTTPException, OAuthAppRequired) @@ -12,137 +14,169 @@ from praw.objects import Comment, Submission from shreddit.util import get_sentence -def shred(config, praw_ini=None): - logging.basicConfig(stream=sys.stdout) - log = logging.getLogger("shreddit") - log.setLevel(level=logging.WARNING) - - if praw_ini: - # PRAW won't panic if the file is invalid, so check first - if not os.path.exists(praw_ini): - print("PRAW configuration file \"{}\" not found.".format(praw_ini)) - return - praw.settings.CONFIG.read(praw_ini) - - save_directory = config.get("save_directory", ".") - - r = praw.Reddit(user_agent="shreddit/4.2") - if save_directory: - r.config.store_json_result = True - - if config.get("verbose", True): - log.setLevel(level=logging.DEBUG) - - try: - # Try to login with OAuth2 - r.refresh_access_information() - log.debug("Logged in with OAuth.") - except (HTTPException, OAuthAppRequired) as e: - log.warning("You should migrate to OAuth2 using get_secret.py before Reddit disables this login method.") +class Shredder(object): + """This class stores state for configuration, API objects, logging, etc. It exposes a shred() method that + application code can call to start it. + """ + def __init__(self, config, praw_ini=None): + logging.basicConfig() + self._logger = logging.getLogger("shreddit") + self._logger.setLevel(level=logging.DEBUG if config.get("verbose", True) else logging.INFO) + self._logger.info(config) + + self._praw_ini = praw_ini + self._username, self._password = config["username"], config["password"] + self._connect(praw_ini, self._username, self._password) + + if config.get("save_directory", "."): + self._r.config.store_json_result = True + + # Read some information from the config and store it + # TODO: Handle this in a much cleaner way + self._whitelist = set(config.get("whitelist", [])) + self._whitelist_ids = set(config.get("whitelist_ids", [])) + self._item = config.get("item", "comments") + self._sort = config.get("sort", "new") + self._whitelist_dist = config.get("whitelist_distinguished", False) + self._whitelist_gild = config.get("whitelist_gilded", False) + self._max_score = config.get("max_score", None) + self._recent_cutoff = arrow.now().replace(hours=-config.get("hours", 24)) + self._nuke_cutoff = arrow.now().replace(hours=-config.get("nuke_hours", 4320)) + self._save = config.get("save_directory", None) + self._trial = config.get("trial_run", False) + self._clear_vote = config.get("clear_vote", False) + self._repl_format = config.get("replacement_format") + self._edit_only = config.get("edit_only", False) + self._batch_cooldown = config.get("batch_cooldown", 10) + if self._save: + if not os.path.exists(self._save): + os.makedirs(self._save) + self._limit = None + self._logger.info("Deleting ALL items before {}".format(self._nuke_cutoff)) + self._logger.info("Deleting items not whitelisted until {}".format(self._recent_cutoff)) + self._logger.info("Ignoring ALL items after {}".format(self._recent_cutoff)) + self._logger.info("Targeting {} sorted by {}".format(self._item, self._sort)) + if self._whitelist: + self._logger.info("Keeping items from subreddits {}".format(", ".join(self._whitelist))) + if self._save: + self._logger.info("Saving deleted items to: {}".format(self._save)) + if self._trial: + self._logger.info("Trial run - no deletion will be performed") + + def shred(self): + deleted = self._remove_things(self._get_things()) + if deleted >= 1000: + # This user has more than 1000 items to handle, which angers the gods of the Reddit API. So chill for a + # while and do it again. + self._logger.info("Finished deleting 1000 items. " \ + "Waiting {} seconds and continuing...".format(self._batch_cooldown)) + time.sleep(self._batch_cooldown) + self._connect(None, self._username, self._password) + self.shred() + + def _connect(self, praw_ini, username, password): + self._r = praw.Reddit(user_agent="shreddit/4.2") + if praw_ini: + # PRAW won't panic if the file is invalid, so check first + if not os.path.exists(praw_ini): + print("PRAW configuration file \"{}\" not found.".format(praw_ini)) + return + praw.settings.CONFIG.read(praw_ini) try: + # Try to login with OAuth2 + self._r.refresh_access_information() + self._logger.debug("Logged in with OAuth.") + except (HTTPException, OAuthAppRequired) as e: + self._logger.warning("You should migrate to OAuth2 using get_secret.py before Reddit disables this login " + "method.") try: - r.login(config["username"], config["password"]) - except InvalidUserPass: - r.login() # Supply details on the command line - except InvalidUser as e: - raise InvalidUser("User does not exist.", e) - except InvalidUserPass as e: - raise InvalidUserPass("Specified an incorrect password.", e) - except RateLimitExceeded as e: - raise RateLimitExceeded("You're doing that too much.", e) - - log.info("Logged in as {user}.".format(user=r.user)) - log.debug("Deleting messages before {time}.".format( - time=datetime.now() - timedelta(hours=config["hours"]))) - - whitelist = config.get("whitelist", []) - whitelist_ids = config.get("whitelist_ids", []) - - if config.get("whitelist"): - log.debug("Keeping messages from subreddits {subs}".format(subs=", ".join(whitelist))) - - remove_things(r, config, log, get_things(r, config, log)) - - -def get_things(r, config, log, after=None): - limit = None - item = config.get("item", "comments") - sort = config.get("sort", "new") - log.debug("Deleting items: {item}".format(item=item)) - if item == "comments": - return r.user.get_comments(limit=limit, sort=sort) - elif item == "submitted": - return r.user.get_submitted(limit=limit, sort=sort) - elif item == "overview": - return r.user.get_overview(limit=limit, sort=sort) - else: - raise Exception("Your deletion section is wrong") - - -def remove_things(r, config, log, things): - for thing in things: - log.debug("Starting remove function on: {thing}".format(thing=thing)) - # Seems to be in users's timezone. Unclear. - thing_time = datetime.fromtimestamp(thing.created_utc) - # Exclude items from being deleted unless past X hours. - after_time = datetime.now() - timedelta(hours=config.get("hours", 24)) - if thing_time > after_time: - if thing_time + timedelta(hours=config.get("nuke_hours", 4320)) < datetime.utcnow(): - pass - continue - # For edit_only we're assuming that the hours aren't altered. - # This saves time when deleting (you don't edit already edited posts). - if config.get("edit_only"): - end_time = after_time - timedelta(hours=config.get("hours", 24)) - if thing_time < end_time: - continue - - if str(thing.subreddit).lower() in config.get("whitelist", []) or thing.id in config.get("whitelist_ids", []): - continue - - if config.get("whitelist_distinguished") and thing.distinguished: - continue - if config.get("whitelist_gilded") and thing.gilded: - continue - if "max_score" in config and thing.score > config["max_score"]: - continue - - if config.get("save_directory"): - save_directory = config["save_directory"] - if not os.path.exists(save_directory): - os.makedirs(save_directory) - with open("%s/%s.json" % (save_directory, thing.id), "w") as fh: - json.dump(thing.json_dict, fh) - - if config.get("trial_run"): # Don't do anything, trial mode! - log.debug("Would have deleted {thing}: '{content}'".format( - thing=thing.id, content=thing)) - continue - - if config.get("clear_vote"): - thing.clear_vote() - - if isinstance(thing, Submission): - log.info("Deleting submission: #{id} {url}".format(id=thing.id, url=thing.url.encode("utf-8"))) - elif isinstance(thing, Comment): - rep_format = config.get("replacement_format") - if rep_format == "random": - replacement_text = get_sentence() - elif rep_format == "dot": - replacement_text = "." - else: - replacement_text = rep_format - - msg = '/r/{3}/ #{0} with:\n\t"{1}" to\n\t"{2}"'.format(thing.id, sub(b"\n\r\t", " ", - thing.body[:78].encode("utf-8")), - replacement_text[:78], thing.subreddit) - - if config.get("edit_only"): - log.info("Editing (not removing) {msg}".format(msg=msg)) + try: + self._r.login(username, password) + except InvalidUserPass: + self._r.login() # Supply details on the command line + except InvalidUser as e: + raise InvalidUser("User does not exist.", e) + except InvalidUserPass as e: + raise InvalidUserPass("Specified an incorrect password.", e) + except RateLimitExceeded as e: + raise RateLimitExceeded("You're doing that too much.", e) + self._logger.info("Logged in as {user}.".format(user=self._r.user)) + + def _check_item(self, item): + """Returns True if the item is whitelisted, False otherwise. + """ + if str(item.subreddit).lower() in self._whitelist or item.id in self._whitelist_ids: + return True + if self._whitelist_dist and item.distinguished: + return True + if self._whitelist_gild and item.gilded: + return True + if self._max_score is not None and item.score > self._max_score: + return True + return False + + def _save_item(self, item): + with open(os.path.join(self._save, item.id), "w") as fh: + json.dump(item.json_dict, fh) + + def _remove_submission(self, sub): + self._logger.info("Deleting submission: #{id} {url}".format(id=sub.id, url=sub.url.encode("utf-8"))) + + def _remove_comment(self, comment): + if self._repl_format == "random": + replacement_text = get_sentence() + elif self._repl_format == "dot": + replacement_text = "." + else: + replacement_text = self._repl_format + + short_text = sub(b"\n\r\t", " ", comment.body[:35].encode("utf-8")) + msg = "/r/{}/ #{} ({}) with: {}".format(comment.subreddit, comment.id, short_text, replacement_text) + + if self._edit_only: + self._logger.info("Editing (not removing) {msg}".format(msg=msg)) + else: + self._logger.info("Editing and deleting {msg}".format(msg=msg)) + if not self._trial: + comment.edit(replacement_text) + + def _remove(self, item): + if self._save: + self._save_item(item) + if self._clear_vote: + item.clear_vote() + if isinstance(item, Submission): + self._remove_submission(item) + elif isinstance(item, Comment): + self._remove_comment(item) + if not self._edit_only and not self._trial: + item.delete() + + def _remove_things(self, items): + for idx, item in enumerate(items): + self._logger.debug("Examining: {}".format(item)) + created = arrow.get(item.created_utc) + if created <= self._nuke_cutoff: + self._logger.debug("Item occurs prior to nuke cutoff") + self._remove(item) + elif created > self._recent_cutoff: + self._logger.debug("Skipping due to: too recent") + continue + elif self._check_item(item): + self._logger.debug("Skipping due to: whitelisted") + continue else: - log.info("Editing and deleting {msg}".format(msg=msg)) - - thing.edit(replacement_text) - if not config.get("edit_only"): - thing.delete() - + self._remove(item) + if not idx % 10: + self._logger.info("{} items handled.".format(idx + 1)) + return idx + 1 + + def _get_things(self): + if self._item == "comments": + return self._r.user.get_comments(limit=self._limit, sort=self._sort) + elif self._item == "submitted": + return self._r.user.get_submitted(limit=self._limit, sort=self._sort) + elif self._item == "overview": + return self._r.user.get_overview(limit=self._limit, sort=self._sort) + else: + raise Exception("Your deletion section is wrong")