2018-10-08 19:11:51 -06:00
#!/usr/bin/env python3
2018-10-24 20:37:11 -06:00
# toot downloader version two!!
2018-10-08 19:11:51 -06:00
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from mastodon import Mastodon
from os import path
from bs4 import BeautifulSoup
2018-10-24 20:37:11 -06:00
import os , sqlite3 , signal , sys , json , re
import requests
2018-10-08 19:11:51 -06:00
2018-10-28 22:36:21 -06:00
# OAuth scopes requested both when registering the application and when logging in
scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses", "read:notifications"]

# load the bot's settings; the file is closed as soon as it has been parsed
with open('config.json', 'r') as config_file:
    cfg = json.load(config_file)

# config.json should contain the instance URL ('site') and the CW text ('cw').
# if either one is missing, fall back to a default.
if 'site' not in cfg:
    # the default has to be stored under 'site': that is the key every later lookup reads
    cfg['site'] = "https://botsin.space"
if 'cw' not in cfg:
    cfg['cw'] = ""
# if the user is using a (very!) old version that still uses the .secret files, migrate to the new method
if os.path.exists("clientcred.secret"):
    print("Upgrading to new storage method")

    # clientcred.secret holds the client id on its first line and the client secret on its second
    with open("clientcred.secret") as client_file:
        cc = client_file.read().split("\n")
    cfg['client'] = {
        "id": cc[0],
        "secret": cc[1],
    }

    # usercred.secret holds the access token, possibly followed by newlines
    with open("usercred.secret") as token_file:
        cfg['secret'] = token_file.read().rstrip("\n")

    # both files are closed by now, so they can be deleted safely
    os.remove("clientcred.secret")
    os.remove("usercred.secret")
2018-10-24 20:37:11 -06:00
# no stored application credentials: register mstdn-ebooks with the instance
if "client" not in cfg:
    print("No application info -- registering application with {}".format(cfg['site']))
    client_id, client_secret = Mastodon.create_app(
        "mstdn-ebooks",
        api_base_url=cfg['site'],
        scopes=scopes,
        website="https://github.com/Lynnesbian/mstdn-ebooks")
    cfg['client'] = {
        "id": client_id,
        "secret": client_secret,
    }

# no stored access token: have the user authorise the bot account by hand
if "secret" not in cfg:
    print("No user credentials -- logging in to {}".format(cfg['site']))
    client = Mastodon(
        client_id=cfg['client']['id'],
        client_secret=cfg['client']['secret'],
        api_base_url=cfg['site'])

    print("Open this URL and authenticate to give mstdn-ebooks access to your bot's account: {}".format(client.auth_request_url(scopes=scopes)))
    # the user pastes the code shown by the instance; log_in's return value is stored as the token
    cfg['secret'] = client.log_in(code=input("Secret: "), scopes=scopes)

# write the (possibly updated) settings back; the context manager guarantees
# the file is flushed and closed before the script carries on
with open("config.json", "w+") as config_file:
    json.dump(cfg, config_file)
2018-10-08 19:11:51 -06:00
2018-10-24 20:37:11 -06:00
def extract_toot(toot):
    """Turn the HTML content of a toot into plain text.

    Mention and hashtag markup is stripped (the visible text is kept),
    <br> and <p> become newlines, and every other link is replaced by the
    URL it points to. Trailing newlines are removed and a zero-width space
    is inserted after every "@" so that the result never mentions anyone.

    toot: the toot's content as an HTML string.
    Returns the plain-text string.
    """
    # decode these two entities by hand before the HTML is parsed
    toot = toot.replace("&apos;", "'")
    toot = toot.replace("&quot;", '"')
    soup = BeautifulSoup(toot, "html.parser")

    # strip the markup wrapped around mentions, keeping the text inside it
    for mention in soup.select("span.h-card"):
        # not every h-card has both inner tags, so only unwrap the ones that exist
        if mention.a is not None:
            mention.a.unwrap()
        if mention.span is not None:
            mention.span.unwrap()

    # replace <br> with linebreak
    for lb in soup.select("br"):
        lb.insert_after("\n")
        lb.decompose()

    # replace <p> with linebreak
    for p in soup.select("p"):
        p.insert_after("\n")
        p.unwrap()

    # fix hashtags: drop the link, keep the tag text
    for ht in soup.select("a.hashtag"):
        ht.unwrap()

    # fix links: every remaining link is replaced by its target URL
    for link in soup.select("a"):
        href = link.get("href")
        if href is None:
            # an anchor without a target has no URL to substitute; keep its text
            link.unwrap()
            continue
        link.insert_after(href)
        link.decompose()

    toot = soup.get_text()
    toot = toot.rstrip("\n")  # remove trailing newline
    toot = toot.replace("@", "@\u200B")  # put a zws between @ and username to avoid mentioning
    return toot
2018-10-08 19:11:51 -06:00
# authenticated API client for the bot's account
client = Mastodon(
    client_id=cfg['client']['id'],
    client_secret=cfg['client']['secret'],
    access_token=cfg['secret'],
    api_base_url=cfg['site'])

me = client.account_verify_credentials()
# account_following() hands back a single page of a paginated listing;
# fetch_remaining() follows the pagination so that every followed account is harvested
following = client.fetch_remaining(client.account_following(me.id))

# local store for the downloaded toots
db = sqlite3.connect("toots.db")
db.text_factory = str
c = db.cursor()
c.execute(
    "CREATE TABLE IF NOT EXISTS `toots` ("
    "id INT NOT NULL UNIQUE PRIMARY KEY, "
    "userid INT NOT NULL, "
    "uri VARCHAR NOT NULL, "
    "content VARCHAR NOT NULL"
    ") WITHOUT ROWID")
db.commit()
def handleCtrlC(signal, frame):
    """SIGINT handler: commit what has been downloaded so far, then exit with status 1.

    signal, frame: the arguments every signal handler receives; neither is used.
    """
    print("\nPREMATURE EVACUATION - Saving chunks")
    db.commit()
    raise SystemExit(1)


# from here on, Ctrl-C saves the database before the script stops
signal.signal(signal.SIGINT, handleCtrlC)
2018-10-27 02:28:20 -06:00
def get_toots_legacy(client, id):
    """Yield the usable toots of one account, fetched page by page through the client API.

    client: object providing account_statuses(id) and fetch_next(page).
    id: the account whose statuses are requested.

    Statuses with spoiler text, boosts, and statuses that are neither public
    nor unlisted are skipped. Each yielded item is a dict with the keys
    "toot" (text produced by extract_toot), "id" and "uri". A progress dot
    is printed after every 20th page.
    """
    pages_fetched = 0
    page = client.account_statuses(id)
    while page is not None and len(page) > 0:
        for status in page:
            # skip CW'd toots and boosts
            if status.spoiler_text != "" or status.reblog is not None:
                continue
            # skip anything that isn't publicly visible
            if status.visibility not in ["public", "unlisted"]:
                continue
            text = extract_toot(status.content)
            if text is None:
                continue
            yield {"toot": text, "id": status.id, "uri": status.uri}
        page = client.fetch_next(page)
        pages_fetched += 1
        if pages_fetched % 20 == 0:
            print('.', end='', flush=True)
2018-10-08 19:11:51 -06:00
def _home_instance(acct, site):
    """Return the hostname the account `acct` lives on.

    An acct of the form "user@host" yields "host". An acct without "@"
    yields the host part of the URL `site`.
    """
    remote = re.search(r"^.*@(.+)", acct)
    if remote is not None:
        return remote.group(1)
    return re.search(r"https?:\/\/(.*)", site).group(1)


def _actor_uri(webfinger):
    """Pick the account's ActivityPub URI out of a parsed WebFinger response.

    The href of the first rel="self" link is preferred. Without one, fall
    back to the aliases list: its only entry, or its second entry when
    there are several.
    """
    for link in webfinger.get('links', []):
        if link.get('rel') == 'self' and link.get('href'):
            return link['href']
    aliases = webfinger['aliases']
    return aliases[0] if len(aliases) == 1 else aliases[1]


def _store_toots(items, userid, pleroma):
    """Save the toots of one outbox page; return True once an already-stored toot is reached.

    items: the page's orderedItems.
    userid: id of the followed account, stored alongside every toot.
    pleroma: when true, stop at the first toot whose URI is already in the
        database (everything after it has been downloaded before).
    """
    for oi in items:
        if oi['type'] != "Create":
            continue  # not a toot (e.g. a boost)
        post = oi['object']
        if post.get('summary'):
            # don't download CW'd toots; a missing or empty summary means there is no CW
            continue
        toot = extract_toot(post['content'])
        try:
            if pleroma:
                # the uri column holds the full object id, so that is what has to be compared
                already_stored = c.execute(
                    "SELECT COUNT(*) FROM toots WHERE uri = ?", (post['id'],)
                ).fetchone()[0]
                if already_stored > 0:
                    # we've caught up to the notices we've already downloaded, so we can stop now
                    return True
            # the last path segment of the object id is used as the row id
            pid = re.search(r"[^\/]+$", post['id']).group(0)
            c.execute(
                "REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",
                (pid, userid, post['id'], toot))
        except Exception:
            # deliberately best-effort: ignore any toots that don't successfully go into the DB.
            # (Exception, not a bare except, so that Ctrl-C / sys.exit still get through)
            pass
    return False


for f in following:
    # newest toot id already stored for this account, or 0 when there is none yet
    last_toot = c.execute(
        "SELECT id FROM `toots` WHERE userid LIKE ? ORDER BY id DESC LIMIT 1", (f.id,)
    ).fetchone()
    last_toot = last_toot[0] if last_toot is not None else 0
    print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot))

    # find the user's activitypub outbox
    print("WebFingering...")
    instance = _home_instance(f.acct, cfg['site'])

    if instance == "bofa.lol":
        print("rest in piece bofa, skipping")
        continue

    try:
        # host-meta carries the URL template used for WebFinger lookups on this instance
        r = requests.get("https://{}/.well-known/host-meta".format(instance), timeout=10)
        uri = re.search(r'template="([^"]+)"', r.text).group(1)
        uri = uri.format(uri="{}@{}".format(f.username, instance))
        r = requests.get(uri, headers={"Accept": "application/json"}, timeout=10)
        uri = "{}/outbox?page=true".format(_actor_uri(r.json()))
        r = requests.get(uri, timeout=10)
        j = r.json()
    except Exception:
        # any failure while locating the outbox stops the whole run
        print("oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)")
        sys.exit(1)

    try:
        # an outbox whose 'first' is an embedded page (not a URL string) is treated as Pleroma
        pleroma = 'first' in j and not isinstance(j['first'], str)
        if pleroma:
            print("Pleroma instance detected")
            j = j['first']
        else:
            print("Mastodon instance detected")
            # only ask for toots newer than the newest one we already have
            uri = "{}&min_id={}".format(uri, last_toot)
            r = requests.get(uri, timeout=10)
            j = r.json()

        print("Downloading and saving toots", end='', flush=True)
        done = False
        while not done and len(j['orderedItems']) > 0:
            done = _store_toots(j['orderedItems'], f.id, pleroma)
            if done:
                break  # caught up, no need to request another page
            # Mastodon pages are followed through 'prev', Pleroma pages through 'next'
            following_page = j.get('next' if pleroma else 'prev')
            if following_page is None:
                break  # no further page is linked: this was the last one
            r = requests.get(following_page, timeout=15)
            j = r.json()
            print('.', end='', flush=True)
        print("Done!")
    except Exception:
        # Exception rather than a bare except: the SystemExit raised by the
        # Ctrl-C handler must end the script, not be mistaken for a download error
        print("Encountered an error! Saving toots to database and moving to next followed account.")

    # keep whatever was stored for this account, whether or not the download completed
    db.commit()
# every followed account has been processed
print("Done!")

# save anything still pending, compact the database file, then close it
db.commit()
db.execute("VACUUM")  # compact db
db.commit()
db.close()