2021-07-25 22:52:13 -06:00
|
|
|
# SPDX-License-Identifier: MPL-2.0
|
|
|
|
|
|
|
|
import sqlite3
|
|
|
|
import markovify
|
2023-07-19 22:10:54 -06:00
|
|
|
import regex
|
2023-07-24 10:04:32 -06:00
|
|
|
from random import randint
|
2021-07-25 22:52:13 -06:00
|
|
|
|
|
|
|
def make_sentence(cfg):
    """Generate one Markov-chain sentence from posts stored in a SQLite database.

    Up to 10000 randomly ordered rows are read from the ``posts`` table of the
    database at ``cfg["db_path"]``. Their ``content`` values (with embedded
    newlines flattened to spaces) are fed to a markovify newline-delimited
    model, which is then asked for a short sentence of at most 500 characters.

    Config keys read:
        db_path: path handed to ``sqlite3.connect``.
        ignored_cws: list of strings compiled into a case-insensitive
            ``regex`` named-list pattern and matched against ``summary``.
        learn_from_cw: when truthy, rows whose ``summary`` does not match the
            ``ignored_cws`` pattern are included in addition to rows with a
            NULL ``summary``; otherwise only NULL-``summary`` rows are used.
        overlap_ratio_enabled: when truthy, the stock ``markovify.NewlineText``
            is used and ``overlap_ratio`` is passed as ``max_overlap_ratio``;
            otherwise the permissive subclass is used with a ratio of 0.7.
        overlap_ratio: only read when ``overlap_ratio_enabled`` is truthy.
        limit_length: when truthy, a random word limit between
            ``length_lower_limit`` and ``length_upper_limit`` (inclusive) is
            passed as ``max_words``.

    Returns:
        The generated sentence.

    Raises:
        ValueError: if the query returns no rows, or if ten consecutive
            generation attempts all return ``None``.
    """

    class nlt_fixed(markovify.NewlineText):  # modified version of NewlineText that never rejects sentences
        def test_sentence_input(self, sentence):
            return True  # all sentences are valid <3

    # Compile before opening the database so a bad pattern/config cannot leak
    # a connection. \L<words> is the regex module's named-list syntax: it
    # matches any one of the strings supplied via the ``words`` keyword.
    p = regex.compile(r"\L<words>", words=cfg["ignored_cws"], flags=regex.IGNORECASE)

    def cw_regexp(x):
        # SQLite user functions return plain values; use 1/0 as the boolean.
        return 1 if p.search(x) else 0

    if cfg['learn_from_cw']:
        # ``summary IS NULL`` is tested first, mirroring the original query.
        query = """
            SELECT content
            FROM posts
            WHERE
                summary IS NULL
                OR NOT CWREGEXP(summary)
            ORDER BY RANDOM() LIMIT 10000
            """
    else:
        query = """
            SELECT content
            FROM posts
            WHERE summary IS NULL
            ORDER BY RANDOM()
            LIMIT 10000
            """

    db = sqlite3.connect(cfg["db_path"])
    try:
        db.text_factory = str
        db.create_function('cwregexp', 1, cw_regexp)
        # fetchall() materialises every row, so nothing below needs the
        # connection and it can be released immediately.
        toots = db.cursor().execute(query).fetchall()
    finally:
        # Close on every path, including a failing query.
        db.close()

    if not toots:
        raise ValueError("Database is empty! Try running main.py.")

    nlt = markovify.NewlineText if cfg['overlap_ratio_enabled'] else nlt_fixed

    # TODO support replicating \n in output posts instead of squashing them together
    model = nlt("\n".join(toot[0].replace('\n', ' ') for toot in toots))

    # Both values are constant across attempts, so compute them once.
    if cfg['limit_length']:
        max_words = randint(cfg['length_lower_limit'], cfg['length_upper_limit'])
    else:
        max_words = None
    max_overlap_ratio = cfg['overlap_ratio'] if cfg['overlap_ratio_enabled'] else 0.7

    for _ in range(10):
        sentence = model.make_short_sentence(
            max_chars=500,
            tries=10000,
            max_overlap_ratio=max_overlap_ratio,
            max_words=max_words,
        )
        if sentence is not None:
            return sentence

    raise ValueError("Failed 10 times to produce a sentence!")
|