From c03c0257737ff92657c0b3312404675b90f7327a Mon Sep 17 00:00:00 2001
From: smitten <everything-cozy@pm.me>
Date: Thu, 20 Jul 2023 00:10:54 -0400
Subject: [PATCH] Support for CW partial regexp match by word. Update
 documentation with 'How to Use'

---
 README.md            | 15 ++++++++++++---
 generators/markov.py |  9 ++++++---
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index f736ca4..bcb07f0 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,15 @@ It's like [@AgathaSorceress's mstdn-ebooks] but it supports Pleroma better.
 ## Secure Fetch
 Secure fetch (aka authorised fetches, authenticated fetches, secure mode...) is *not* supported by pleroma-ebooks, and will fail to download any posts from users on instances with secure fetch enabled. For more information, see [this wiki page](https://github.com/Lynnesbian/mstdn-ebooks/wiki/Secure-fetch).
 
+
+## How to Use
+1. Create your bot account on the server.
+2. Follow the user(s) you want to base the model on.
+3. Get an access token for your bot. See [mastodon-bot](https://tinysubversions.com/notes/mastodon-bot/) for details.
+4. Copy `config.defaults.json` to `config.json` and set your bot's token as `access_token`. Make any other config tweaks you'd like.
+5. Run `fetch_posts.py` to collect the posts from the followed user(s).
+6. Run `gen.py` to generate a sentence and post it to the server.
+
 ## Compatibility
 | Software  | Downloading statuses                                              | Posting | Replying                                                    |
 |-----------|-------------------------------------------------------------------|---------|-------------------------------------------------------------|
@@ -31,11 +40,11 @@ Configuring pleroma-ebooks is accomplished by editing `config.json`. If you want
 | site                     | https://botsin.space                    | The instance your bot will log in to and post from. This must start with `https://` or `http://` (preferably the latter)                                                                                                                                                                |
 | cw                       | null                                    | The content warning (aka subject) pleroma-ebooks will apply to non-error posts.                                                                                                                                                                                                           |
 | learn_from_cw            | false                                   | If true, pleroma-ebooks will learn from CW'd posts.                                                                                                                                                                                                                                       |
-| ignored_cws              | []                                      | If `learn_from_cw` is true, do not learn from posts with these CWs.
+| ignored_cws              | []                                      | If `learn_from_cw` is true, do not learn from posts whose CW contains any of these words. Matching is case-insensitive.
 | mention_handling         | 1                                       | 0: Never use mentions. 1: Only generate fake mentions in the middle of posts, never at the start. 2: Use mentions as normal (old behaviour).                                                                                                                                            |
 | max_thread_length        | 15                                      | The maximum number of bot posts in a thread before it stops replying. A thread can be 10 or 10000 posts long, but the bot will stop after it has posted `max_thread_length` times.                                                                                                      |
 | strip_paired_punctuation | false                                   | If true, pleroma-ebooks will remove punctuation that commonly appears in pairs, like " and (). This avoids the issue of posts that open a bracket (or quote) without closing it.                                                                                                          |
-| limit_length             | false                                   | If true, the sentence length will be random between `length_lower_limit` and `length_upper_limit`                                                                                                                                                                                       |
+| limit_length             | false                                   | If true, the sentence word length will be random between `length_lower_limit` and `length_upper_limit`                                                                                                                                                                                       |
 | length_lower_limit       | 5                                       | The lower bound in the random number range above. Only matters if `limit_length` is true.                                                                                                                                                                                               |
 | length_upper_limit       | 50                                      | The upper bound in the random number range above. Can be the same as `length_lower_limit` to disable randomness. Only matters if `limit_length` is true.                                                                                                                                |
 | overlap_ratio_enabled    | false                                   | If true, checks the output's similarity to the original posts.                                                                                                                                                                                                                          |
@@ -51,4 +60,4 @@ Please don't feel obligated to donate at all.
 
 This is released under the AGPLv3 (only) license, and based on Lynnesbian's fork which is under the MPL 2.0 license. See LICENSE-AGPL.md and LICENSE-MPL for details.
 
-**This means you must publish the source code of any ebooks bot you make with this.** A link back to this repository on your bot's profile page or profile metadata will suffice. If you make changes to the code you need to link to your fork/repo instead
+**This means you must publish the source code of any ebooks bot you make with this.** A link back to this repository on your bot's profile page or profile metadata will suffice. If you make changes to the code you need to link to your fork/repo instead.
diff --git a/generators/markov.py b/generators/markov.py
index 5bf7a0e..4352c5e 100644
--- a/generators/markov.py
+++ b/generators/markov.py
@@ -2,6 +2,7 @@
 
 import sqlite3
 import markovify
+import regex
 
 def make_sentence(cfg):
 	class nlt_fixed(markovify.NewlineText):  # modified version of NewlineText that never rejects sentences
@@ -10,19 +11,21 @@ def make_sentence(cfg):
 
 	db = sqlite3.connect(cfg["db_path"])
 	db.text_factory = str
+	def cw_regexp(x):
+		p = regex.compile(r"\L<words>", words=cfg["ignored_cws"],flags=regex.IGNORECASE)
+		return 1 if p.search(x) else 0
+	db.create_function('cwregexp', 1, cw_regexp)
 	c = db.cursor()
 	if cfg['learn_from_cw']:
-		ignored_cws_query_params = "(" + ",".join("?" * len(cfg["ignored_cws"])) + ")"
 		toots = c.execute(
 			f"""
 			SELECT content
 			FROM posts
 			WHERE
 				summary IS NULL
-				OR summary NOT IN {ignored_cws_query_params}
+				OR NOT CWREGEXP(summary)
 			ORDER BY RANDOM() LIMIT 10000
 			""",
-			cfg["ignored_cws"],
 		).fetchall()
 	else:
 		toots = c.execute(