From c03c0257737ff92657c0b3312404675b90f7327a Mon Sep 17 00:00:00 2001 From: smitten Date: Thu, 20 Jul 2023 00:10:54 -0400 Subject: [PATCH] Support for CW partial regexp match by word. Update documentation with 'How to Use' --- README.md | 15 ++++++++++++--- generators/markov.py | 9 ++++++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index f736ca4..bcb07f0 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,15 @@ It's like [@AgathaSorceress's mstdn-ebooks] but it supports Pleroma better. ## Secure Fetch Secure fetch (aka authorised fetches, authenticated fetches, secure mode...) is *not* supported by pleroma-ebooks, and will fail to download any posts from users on instances with secure fetch enabled. For more information, see [this wiki page](https://github.com/Lynnesbian/mstdn-ebooks/wiki/Secure-fetch). + +## How to Use +1. Create your bot account on the server. +2. Follow the user(s) you want to base the model on. +3. Get an access token for your bot. See [mastodon-bot](https://tinysubversions.com/notes/mastodon-bot/) for details. +4. Copy `config.defaults.json` to `config.json` and set `access_token` to the token from step 3. Make any other config tweaks you'd like. +5. Run `fetch_posts.py` to collect the posts from the followed user(s). +6. Run `gen.py` to generate a sentence and post it to the server. + ## Compatibility | Software | Downloading statuses | Posting | Replying | |-----------|-------------------------------------------------------------------|---------|-------------------------------------------------------------| @@ -31,11 +40,11 @@ Configuring pleroma-ebooks is accomplished by editing `config.json`. If you want | site | https://botsin.space | The instance your bot will log in to and post from. This must start with `https://` or `http://` (preferably the latter) | | cw | null | The content warning (aka subject) pleroma-ebooks will apply to non-error posts. 
| | learn_from_cw | false | If true, pleroma-ebooks will learn from CW'd posts. | -| ignored_cws | [] | If `learn_from_cw` is true, do not learn from posts with these CWs. +| ignored_cws | [] | If `learn_from_cw` is true, do not learn from posts whose CW contains any of these words. Matching is case-insensitive and by word. | | mention_handling | 1 | 0: Never use mentions. 1: Only generate fake mentions in the middle of posts, never at the start. 2: Use mentions as normal (old behaviour). | | max_thread_length | 15 | The maximum number of bot posts in a thread before it stops replying. A thread can be 10 or 10000 posts long, but the bot will stop after it has posted `max_thread_length` times. | | strip_paired_punctuation | false | If true, pleroma-ebooks will remove punctuation that commonly appears in pairs, like " and (). This avoids the issue of posts that open a bracket (or quote) without closing it. | -| limit_length | false | If true, the sentence length will be random between `length_lower_limit` and `length_upper_limit` | +| limit_length | false | If true, the sentence length (in words) will be random between `length_lower_limit` and `length_upper_limit` | | length_lower_limit | 5 | The lower bound in the random number range above. Only matters if `limit_length` is true. | | length_upper_limit | 50 | The upper bound in the random number range above. Can be the same as `length_lower_limit` to disable randomness. Only matters if `limit_length` is true. | | overlap_ratio_enabled | false | If true, checks the output's similarity to the original posts. | @@ -51,4 +60,4 @@ Please don't feel obligated to donate at all. This is released under the AGPLv3 (only) license, and based on Lynnesbian's fork which is under the MPL 2.0 license. See LICENSE-AGPL.md and LICENSE-MPL for details. -**This means you must publish the source code of any ebooks bot you make with this.** A link back to this repository on your bot's profile page or profile metadata will suffice. 
If you make changes to the code you need to link to your fork/repo instead +**This means you must publish the source code of any ebooks bot you make with this.** A link back to this repository on your bot's profile page or profile metadata will suffice. If you make changes to the code you need to link to your fork/repo instead. diff --git a/generators/markov.py b/generators/markov.py index 5bf7a0e..4352c5e 100644 --- a/generators/markov.py +++ b/generators/markov.py @@ -2,6 +2,7 @@ import sqlite3 import markovify +import regex def make_sentence(cfg): class nlt_fixed(markovify.NewlineText): # modified version of NewlineText that never rejects sentences @@ -10,19 +11,21 @@ def make_sentence(cfg): db = sqlite3.connect(cfg["db_path"]) db.text_factory = str + def cw_regexp(x): + p = regex.compile(r"\L<words>", words=cfg["ignored_cws"],flags=regex.IGNORECASE) + return 1 if p.search(x) else 0 + db.create_function('cwregexp', 1, cw_regexp) c = db.cursor() if cfg['learn_from_cw']: - ignored_cws_query_params = "(" + ",".join("?" * len(cfg["ignored_cws"])) + ")" toots = c.execute( f""" SELECT content FROM posts WHERE summary IS NULL - OR summary NOT IN {ignored_cws_query_params} + OR NOT CWREGEXP(summary) ORDER BY RANDOM() LIMIT 10000 """, - cfg["ignored_cws"], ).fetchall() else: toots = c.execute(