2018-10-08 19:11:51 -06:00
#!/usr/bin/env python3
2018-10-24 20:37:11 -06:00
# toot downloader version two!!
2018-10-08 19:11:51 -06:00
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from mastodon import Mastodon
from os import path
from bs4 import BeautifulSoup
2018-10-24 20:37:11 -06:00
import os , sqlite3 , signal , sys , json , re
import requests
2018-10-08 19:11:51 -06:00
2018-10-28 22:36:21 -06:00
# OAuth scopes requested both when registering the application and when
# logging the user in.
scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses", "read:notifications"]

# Load the configuration; the `with` block closes the handle once parsing is done.
with open('config.json', 'r') as cfg_file:
    cfg = json.load(cfg_file)

if os.path.exists("clientcred.secret"):
    # Migrate credentials from the old *.secret files into config.json.
    print("Upgrading to new storage method")
    with open("clientcred.secret") as cred_file:
        cc = cred_file.read().split("\n")
    cfg['client'] = {
        "id": cc[0],
        "secret": cc[1]
    }
    with open("usercred.secret") as cred_file:
        cfg['secret'] = cred_file.read().rstrip("\n")

    # Write the migrated credentials out *before* deleting the legacy files,
    # so a failure later in this script cannot lose them.
    with open("config.json", "w+") as cfg_file:
        json.dump(cfg, cfg_file)
    os.remove("clientcred.secret")
    os.remove("usercred.secret")

if "client" not in cfg:
    print("No client credentials, registering application")
    client_id, client_secret = Mastodon.create_app(
        "mstdn-ebooks",
        api_base_url=cfg['site'],
        scopes=scopes,
        website="https://github.com/Lynnesbian/mstdn-ebooks")

    cfg['client'] = {
        "id": client_id,
        "secret": client_secret
    }

if "secret" not in cfg:
    print("No user credentials, logging in")
    client = Mastodon(
        client_id=cfg['client']['id'],
        client_secret=cfg['client']['secret'],
        api_base_url=cfg['site'])

    # The user opens the URL in a browser and pastes the resulting code back in.
    print("Open this URL: {}".format(client.auth_request_url(scopes=scopes)))
    cfg['secret'] = client.log_in(code=input("Secret: "), scopes=scopes)

# Persist whatever credentials were loaded, migrated or obtained above.
with open("config.json", "w+") as cfg_file:
    json.dump(cfg, cfg_file)
2018-10-08 19:11:51 -06:00
2018-10-24 20:37:11 -06:00
def extract_toot(toot):
    """Convert the HTML content of a toot into plain text.

    Mention and hashtag markup is flattened to its text, ``<br>`` and ``<p>``
    become newlines, and remaining links are replaced by their ``href``.
    Every ``@`` in the result is followed by a zero-width space so that
    re-posting the text does not mention anyone.

    Args:
        toot: The status content as an HTML string.

    Returns:
        The plain-text rendering, without trailing newlines.
    """
    # Decode the apostrophe/quote entities before parsing.
    toot = toot.replace("&apos;", "'")
    toot = toot.replace("&quot;", '"')
    soup = BeautifulSoup(toot, "html.parser")

    # Flatten mention markup: drop the link and the inner span but keep their text.
    # TODO: make it so that it removes the @ and instance but keeps the name
    for mention in soup.select("span.h-card"):
        # Guard each step so unexpected mention markup does not raise.
        if mention.a is not None:
            mention.a.unwrap()
        if mention.span is not None:
            mention.span.unwrap()

    # replace <br> with linebreak
    for lb in soup.select("br"):
        lb.insert_after("\n")
        lb.decompose()

    # replace <p> with linebreak
    for p in soup.select("p"):
        p.insert_after("\n")
        p.unwrap()

    # fix hashtags: keep the tag text, drop the link around it
    for ht in soup.select("a.hashtag"):
        ht.unwrap()

    # fix links: replace every remaining anchor with its target URL
    for link in soup.select("a"):
        href = link.get("href")
        if href is None:
            # No target to substitute; keep the anchor's text instead of raising.
            link.unwrap()
            continue
        link.insert_after(href)
        link.decompose()

    toot = soup.get_text()
    toot = toot.rstrip("\n")  # remove trailing newline
    # Put a zero-width space (U+200B) between @ and the username to avoid
    # mentioning. The previous U+202B is RIGHT-TO-LEFT EMBEDDING, not a ZWS.
    toot = toot.replace("@", "@\u200B")
    return toot
2018-10-08 19:11:51 -06:00
# API client authenticated with the credentials stored in the config.
client = Mastodon(
    client_id=cfg['client']['id'],
    client_secret=cfg['client']['secret'],
    access_token=cfg['secret'],
    api_base_url=cfg['site'])

me = client.account_verify_credentials()
# account_following() returns a single page of results; fetch_remaining()
# walks the pagination so every followed account is harvested, not just
# the first page.
following = client.fetch_remaining(client.account_following(me.id))

# Local database holding every harvested toot.
db = sqlite3.connect("toots.db")
db.text_factory = str
c = db.cursor()
c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID")
db.commit()
def handleCtrlC(signum, frame):
    """SIGINT handler: commit pending database writes, then exit with status 1.

    Args:
        signum: Signal number passed by the signal machinery (unused). Named
            ``signum`` so it does not shadow the ``signal`` module.
        frame: Interrupted stack frame (unused).
    """
    print("\nPREMATURE EVACUATION - Saving chunks")
    db.commit()
    sys.exit(1)


# Save already-downloaded toots when the user presses Ctrl-C.
signal.signal(signal.SIGINT, handleCtrlC)
2018-10-27 02:28:20 -06:00
def get_toots_legacy(client, id):
    """Yield the harvestable statuses of an account through the client API.

    Pages through ``client.account_statuses(id)`` with ``client.fetch_next``
    and skips statuses that have a content warning, are boosts, or are not
    public/unlisted. A progress dot is printed every 20 pages.

    Args:
        client: API client providing ``account_statuses`` and ``fetch_next``.
        id: Identifier of the account whose statuses are fetched.

    Yields:
        Dicts with the keys ``"toot"`` (plain text), ``"id"`` and ``"uri"``.
    """
    pages_done = 0
    page = client.account_statuses(id)
    while page is not None and len(page) > 0:
        for status in page:
            # Same checks, same order: content warning, boost, visibility.
            unwanted = (
                status.spoiler_text != ""
                or status.reblog is not None
                or status.visibility not in ["public", "unlisted"]
            )
            if unwanted:
                continue

            text = extract_toot(status.content)
            if text is None:
                continue
            yield {
                "toot": text,
                "id": status.id,
                "uri": status.uri
            }

        page = client.fetch_next(page)
        pages_done += 1
        if pages_done % 20 == 0:
            print('.', end='', flush=True)
2018-10-08 19:11:51 -06:00
# Harvest new toots for every followed account, one account at a time.
for f in following:
    # Resume from the newest toot already stored for this account.
    # `=` rather than LIKE: LIKE matches case-insensitively and treats
    # % and _ as wildcards, which is wrong for an id comparison.
    last_toot = c.execute(
        "SELECT id FROM `toots` WHERE userid = ? ORDER BY id DESC LIMIT 1",
        (f.id,)).fetchone()
    last_toot = last_toot[0] if last_toot is not None else 0
    print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot))

    # find the user's activitypub outbox
    print("WebFingering...")
    instance = re.search(r"^.*@(.+)", f.acct)
    if instance is None:
        # No "@host" part in acct: fall back to the host of the configured site.
        instance = re.search(r"https?:\/\/(.*)", cfg['site']).group(1)
    else:
        instance = instance.group(1)

    if instance == "bofa.lol":
        print("rest in piece bofa, skipping")
        continue

    try:
        # host-meta supplies the lookup URL template for this instance.
        r = requests.get("https://{}/.well-known/host-meta".format(instance), timeout=10)
        uri = re.search(r'template="([^"]+)"', r.text).group(1)
        uri = uri.format(uri="{}@{}".format(f.username, instance))
        r = requests.get(uri, headers={"Accept": "application/json"}, timeout=10)
        j = r.json()
        if len(j['aliases']) == 1:  # TODO: this is a hack on top of a hack, fix it
            uri = j['aliases'][0]
        else:
            uri = j['aliases'][1]
        uri = "{}/outbox?page=true&min_id={}".format(uri, last_toot)
        r = requests.get(uri, timeout=10)
        j = r.json()
    except Exception:
        print("oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)")
        sys.exit(1)

    # An outbox response carrying a 'first' key is treated as pleroma, whose
    # toots are collected through the client API instead of the outbox pages.
    pleroma = False
    if 'first' in j:
        print("{} is a pleroma instance -- falling back to legacy toot collection method".format(instance))
        pleroma = True

    print("Downloading and parsing toots", end='', flush=True)
    try:
        if pleroma:
            for t in get_toots_legacy(client, f.id):
                try:
                    c.execute(
                        "REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",
                        (t['id'], f.id, t['uri'], t['toot']))
                except Exception:
                    # Best effort: ignore any toots that don't go into the DB.
                    # `Exception` (not a bare except) so the SystemExit raised
                    # by the Ctrl-C handler still terminates the script.
                    pass
        else:
            while len(j['orderedItems']) > 0:
                for oi in j['orderedItems']:
                    if oi['type'] != "Create":
                        continue
                    # its a toost baby
                    content = oi['object']['content']
                    if oi['object']['summary'] is not None:
                        # don't download CW'd toots
                        continue
                    toot = extract_toot(content)
                    try:
                        # The last path segment of the object id is the row id.
                        pid = re.search(r"[^\/]+$", oi['object']['id']).group(0)
                        c.execute(
                            "REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",
                            (pid, f.id, oi['object']['id'], toot))
                    except Exception:
                        pass  # ignore any toots that don't go into the DB

                prev_page = j.get('prev')
                if prev_page is None:
                    # No further page: finish cleanly instead of raising KeyError.
                    break
                r = requests.get(prev_page, timeout=10)
                j = r.json()
                print('.', end='', flush=True)

        print("Done!")
        db.commit()
    except Exception:
        # Keep what was collected for this account and move on to the next one.
        print("Encountered an error! Saving toots to database and continuing.")
        db.commit()

print("Done!")

db.commit()
db.execute("VACUUM")  # compact db
db.commit()
db.close()