From 762b0fd32005e3a403dcb44bae32a53f2f6d1777 Mon Sep 17 00:00:00 2001 From: bobloy Date: Fri, 25 Sep 2020 12:02:13 -0400 Subject: [PATCH 01/21] WIP Twitter training --- chatter/chat.py | 21 ++++++++++---------- chatter/trainers.py | 48 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 10 deletions(-) create mode 100644 chatter/trainers.py diff --git a/chatter/chat.py b/chatter/chat.py index ad8e37b..607457c 100644 --- a/chatter/chat.py +++ b/chatter/chat.py @@ -15,6 +15,8 @@ from redbot.core.commands import Cog from redbot.core.data_manager import cog_data_path from redbot.core.utils.predicates import MessagePredicate +from chatter.trainers import TwitterCorpusTrainer + log = logging.getLogger("red.fox_v3.chatter") @@ -105,15 +107,7 @@ class Chatter(Cog): return msg.clean_content def new_conversation(msg, sent, out_in, delta): - # if sent is None: - # return False - - # Don't do "too short" processing here. Sometimes people don't respond. - # if len(out_in) < 2: - # return False - - # print(msg.created_at - sent) - + # Should always be positive numbers return msg.created_at - sent >= delta for channel in ctx.guild.text_channels: @@ -158,6 +152,11 @@ class Chatter(Cog): return out + def _train_twitter(self, *args, **kwargs): + trainer = TwitterCorpusTrainer(self.chatbot) + trainer.train(*args, **kwargs) + return True + def _train_ubuntu(self): trainer = UbuntuCorpusTrainer(self.chatbot) trainer.train() @@ -479,7 +478,9 @@ class Chatter(Cog): text = message.clean_content async with channel.typing(): - future = await self.loop.run_in_executor(None, self.chatbot.get_response, text) + # Switched to `generate_response` from `get_result` + # Switch back once better conversation detection is used. + future = await self.loop.run_in_executor(None, self.chatbot.generate_response, text) if future and str(future): await channel.send(str(future)) diff --git a/chatter/trainers.py b/chatter/trainers.py new file mode 100644 index 0000000..e6eedba --- /dev/null +++ b/chatter/trainers.py @@ -0,0 +1,48 @@ +from chatterbot import utils +from chatterbot.conversation import Statement +from chatterbot.trainers import Trainer + + +class TwitterCorpusTrainer(Trainer): + def train(self, *args, **kwargs): + """ + Train the chat bot based on the provided list of + statements that represents a single conversation. + """ + import twint + + c = twint.Config() + c.__dict__.update(kwargs) + twint.run.Search(c) + + + previous_statement_text = None + previous_statement_search_text = '' + + statements_to_create = [] + + for conversation_count, text in enumerate(conversation): + if self.show_training_progress: + utils.print_progress_bar( + 'List Trainer', + conversation_count + 1, len(conversation) + ) + + statement_search_text = self.chatbot.storage.tagger.get_text_index_string(text) + + statement = self.get_preprocessed_statement( + Statement( + text=text, + search_text=statement_search_text, + in_response_to=previous_statement_text, + search_in_response_to=previous_statement_search_text, + conversation='training' + ) + ) + + previous_statement_text = statement.text + previous_statement_search_text = statement_search_text + + statements_to_create.append(statement) + + self.chatbot.storage.create_many(statements_to_create) \ No newline at end of file From 26234e3b18a465ded651960a73ed7d15692a53fb Mon Sep 17 00:00:00 2001 From: bobloy Date: Mon, 19 Oct 2020 15:16:49 -0400 Subject: [PATCH 02/21] Alternate dependencies attempt --- chatter/info.json | 3 +- chatter/trainers.py | 85 +++++++++++++++++++++++---------------------- 2 files changed, 45 insertions(+), 43 deletions(-) diff --git a/chatter/info.json b/chatter/info.json index b79e587..df77ee8 100644 --- a/chatter/info.json +++ b/chatter/info.json @@ -17,7 +17,8 @@ "pytz", "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz#egg=en_core_web_sm", "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.3.1/en_core_web_md-2.3.1.tar.gz#egg=en_core_web_md", - "spacy>=2.3,<2.4" + "spacy>=2.3,<2.4", + "--no-deps \"chatterbot>=1.1\"" ], "short": "Local Chatbot run on machine learning", "end_user_data_statement": "This cog only stores anonymous conversations data; no End User Data is stored.", diff --git a/chatter/trainers.py b/chatter/trainers.py index e6eedba..42d6288 100644 --- a/chatter/trainers.py +++ b/chatter/trainers.py @@ -4,45 +4,46 @@ from chatterbot.trainers import Trainer class TwitterCorpusTrainer(Trainer): - def train(self, *args, **kwargs): - """ - Train the chat bot based on the provided list of - statements that represents a single conversation. - """ - import twint - - c = twint.Config() - c.__dict__.update(kwargs) - twint.run.Search(c) - - - previous_statement_text = None - previous_statement_search_text = '' - - statements_to_create = [] - - for conversation_count, text in enumerate(conversation): - if self.show_training_progress: - utils.print_progress_bar( - 'List Trainer', - conversation_count + 1, len(conversation) - ) - - statement_search_text = self.chatbot.storage.tagger.get_text_index_string(text) - - statement = self.get_preprocessed_statement( - Statement( - text=text, - search_text=statement_search_text, - in_response_to=previous_statement_text, - search_in_response_to=previous_statement_search_text, - conversation='training' - ) - ) - - previous_statement_text = statement.text - previous_statement_search_text = statement_search_text - - statements_to_create.append(statement) - - self.chatbot.storage.create_many(statements_to_create) \ No newline at end of file + pass + # def train(self, *args, **kwargs): + # """ + # Train the chat bot based on the provided list of + # statements that represents a single conversation. + # """ + # import twint + # + # c = twint.Config() + # c.__dict__.update(kwargs) + # twint.run.Search(c) + # + # + # previous_statement_text = None + # previous_statement_search_text = '' + # + # statements_to_create = [] + # + # for conversation_count, text in enumerate(conversation): + # if self.show_training_progress: + # utils.print_progress_bar( + # 'List Trainer', + # conversation_count + 1, len(conversation) + # ) + # + # statement_search_text = self.chatbot.storage.tagger.get_text_index_string(text) + # + # statement = self.get_preprocessed_statement( + # Statement( + # text=text, + # search_text=statement_search_text, + # in_response_to=previous_statement_text, + # search_in_response_to=previous_statement_search_text, + # conversation='training' + # ) + # ) + # + # previous_statement_text = statement.text + # previous_statement_search_text = statement_search_text + # + # statements_to_create.append(statement) + # + # self.chatbot.storage.create_many(statements_to_create) \ No newline at end of file From a6ebe02233eadd97cedc3191b680d3a3040dd8fe Mon Sep 17 00:00:00 2001 From: bobloy Date: Mon, 19 Oct 2020 16:09:21 -0400 Subject: [PATCH 03/21] Back to basics --- chatter/info.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/chatter/info.json b/chatter/info.json index df77ee8..b79e587 100644 --- a/chatter/info.json +++ b/chatter/info.json @@ -17,8 +17,7 @@ "pytz", "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz#egg=en_core_web_sm", "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.3.1/en_core_web_md-2.3.1.tar.gz#egg=en_core_web_md", - "spacy>=2.3,<2.4", - "--no-deps \"chatterbot>=1.1\"" + "spacy>=2.3,<2.4" ], "short": "Local Chatbot run on machine learning", "end_user_data_statement": "This cog only stores anonymous conversations data; no End User Data is stored.", From 46342109604e2824a3bd011dfbd880fe3909e91c Mon Sep 17 00:00:00 2001 From: bobloy Date: Mon, 19 Oct 2020 16:24:39 -0400 Subject: [PATCH 04/21] Add automatic install option --- chatter/README.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/chatter/README.md b/chatter/README.md index 8ef6734..c831bb8 100644 --- a/chatter/README.md +++ b/chatter/README.md @@ -59,6 +59,35 @@ Install these on your windows machine before attempting the installation: [Pandoc - Universal Document Converter](https://pandoc.org/installing.html) ## Methods +### Automatic + +This method requires some luck to pull off. + +#### Step 1: Add repo and install cog + +``` +[p]repo add Fox https://github.com/bobloy/Fox-V3 +[p]cog install Fox chatter +``` + +If you get an error at this step, stop and skip to one of the manual methods below. + +#### Step 2: Install additional dependencies + +Assuming the previous commands had no error, you can now use `pipinstall` to add the remaining dependencies. + +NOTE: This method is not the intended use case for `pipinstall` and may stop working in the future. + +``` +[p]pipinstall --no-deps chatterbot>=1.1 +``` + +#### Step 3: Load the cog and get started + +``` +[p]load chatter +``` + ### Windows - Manually #### Step 1: Built-in Downloader From 14f8b825d8a5a81d12aa885da7236b13e97964d0 Mon Sep 17 00:00:00 2001 From: bobloy Date: Tue, 2 Feb 2021 16:35:41 -0500 Subject: [PATCH 05/21] Fix bad learning and checks --- chatter/chat.py | 55 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 8 deletions(-) diff --git a/chatter/chat.py b/chatter/chat.py index 0988d46..a0a5f28 100644 --- a/chatter/chat.py +++ b/chatter/chat.py @@ -2,8 +2,10 @@ import asyncio import logging import os import pathlib +from collections import defaultdict from datetime import datetime, timedelta -from typing import Optional +from functools import partial +from typing import Dict, Optional import discord from chatterbot import ChatBot @@ -75,6 +77,10 @@ class Chatter(Cog): self.loop = asyncio.get_event_loop() + self._guild_cache = defaultdict(dict) + + self._last_message_per_channel: Dict[Optional[discord.Message]] = defaultdict(lambda: None) + async def red_delete_data_for_user(self, **kwargs): """Nothing to delete""" return @@ -190,6 +196,7 @@ class Chatter(Cog): if ctx.invoked_subcommand is None: pass + @commands.admin() @chatter.command(name="channel") async def chatter_channel( self, ctx: commands.Context, channel: Optional[discord.TextChannel] = None @@ -209,6 +216,7 @@ class Chatter(Cog): await self.config.guild(ctx.guild).chatchannel.set(channel.id) await ctx.maybe_send_embed(f"Chat channel is now {channel.mention}") + @commands.is_owner() @chatter.command(name="cleardata") async def chatter_cleardata(self, ctx: commands.Context, confirm: bool = False): """ @@ -241,6 +249,7 @@ class Chatter(Cog): await ctx.tick() + @commands.is_owner() @chatter.command(name="algorithm", aliases=["algo"]) async def chatter_algorithm( self, ctx: commands.Context, algo_number: int, threshold: float = None @@ -274,6 +283,7 @@ class Chatter(Cog): await ctx.tick() + @commands.is_owner() @chatter.command(name="model") async def chatter_model(self, ctx: commands.Context, model_number: int): """ @@ -311,6 +321,7 @@ class Chatter(Cog): f"Model has been switched to {self.tagger_language.ISO_639_1}" ) + @commands.is_owner() @chatter.command(name="minutes") async def minutes(self, ctx: commands.Context, minutes: int): """ @@ -322,10 +333,12 @@ class Chatter(Cog): await ctx.send_help() return - await self.config.guild(ctx.guild).convo_length.set(minutes) + await self.config.guild(ctx.guild).convo_delta.set(minutes) + self._guild_cache[ctx.guild.id]["convo_delta"] = minutes await ctx.tick() + @commands.is_owner() @chatter.command(name="age") async def age(self, ctx: commands.Context, days: int): """ @@ -340,6 +353,7 @@ class Chatter(Cog): await self.config.guild(ctx.guild).days.set(days) await ctx.tick() + @commands.is_owner() @chatter.command(name="backup") async def backup(self, ctx, backupname): """ @@ -361,6 +375,7 @@ class Chatter(Cog): else: await ctx.maybe_send_embed("Error occurred :(") + @commands.is_owner() @chatter.command(name="trainubuntu") async def chatter_train_ubuntu(self, ctx: commands.Context, confirmation: bool = False): """ @@ -382,6 +397,7 @@ class Chatter(Cog): else: await ctx.send("Error occurred :(") + @commands.is_owner() @chatter.command(name="trainenglish") async def chatter_train_english(self, ctx: commands.Context): """ @@ -395,6 +411,7 @@ class Chatter(Cog): else: await ctx.maybe_send_embed("Error occurred :(") + @commands.is_owner() @chatter.command() async def train(self, ctx: commands.Context, channel: discord.TextChannel): """ @@ -477,12 +494,34 @@ class Chatter(Cog): text = message.clean_content - async with channel.typing(): - # Switched to `generate_response` from `get_result` - # Switch back once better conversation detection is used. - future = await self.loop.run_in_executor(None, self.chatbot.generate_response, text) + async with ctx.typing(): + + if not self._guild_cache[ctx.guild.id]: + self._guild_cache[ctx.guild.id] = await self.config.guild(ctx.guild).all() + + if self._last_message_per_channel[ctx.channel.id] is not None: + last_m: discord.Message = self._last_message_per_channel[ctx.channel.id] + minutes = self._guild_cache[ctx.guild.id]["convo_delta"] + if (datetime.utcnow() - last_m.created_at).seconds > minutes*60: + in_response_to = None + else: + in_response_to = last_m.content + else: + in_response_to = None + + if in_response_to is None: + log.debug("Generating response") + Statement = self.chatbot.storage.get_object('statement') + future = await self.loop.run_in_executor( + None, self.chatbot.generate_response, Statement(text) + ) + else: + log.debug("Getting response") + future = await self.loop.run_in_executor( + None, partial(self.chatbot.get_response, text, in_response_to=in_response_to) + ) if future and str(future): - await channel.send(str(future)) + self._last_message_per_channel[ctx.channel.id] = await ctx.send(str(future)) else: - await channel.send(":thinking:") + await ctx.send(":thinking:") From 337def2fa32ebdcc788e5d785bd388537a6b4899 Mon Sep 17 00:00:00 2001 From: bobloy Date: Mon, 15 Feb 2021 10:18:18 -0500 Subject: [PATCH 06/21] Some progress on updated ubuntu trainer --- chatter/chat.py | 79 +++++++++++++++++++++--- chatter/info.json | 3 +- chatter/trainers.py | 142 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 213 insertions(+), 11 deletions(-) diff --git a/chatter/chat.py b/chatter/chat.py index a0a5f28..098ba73 100644 --- a/chatter/chat.py +++ b/chatter/chat.py @@ -17,7 +17,7 @@ from redbot.core.commands import Cog from redbot.core.data_manager import cog_data_path from redbot.core.utils.predicates import MessagePredicate -from chatter.trainers import TwitterCorpusTrainer +from chatter.trainers import TwitterCorpusTrainer, UbuntuCorpusTrainer2 log = logging.getLogger("red.fox_v3.chatter") @@ -168,6 +168,10 @@ class Chatter(Cog): trainer.train() return True + async def _train_ubuntu2(self): + trainer = UbuntuCorpusTrainer2(self.chatbot, cog_data_path(self)) + await trainer.asynctrain() + def _train_english(self): trainer = ChatterBotCorpusTrainer(self.chatbot) # try: @@ -353,6 +357,15 @@ class Chatter(Cog): await self.config.guild(ctx.guild).days.set(days) await ctx.tick() + @commands.is_owner() + @chatter.command(name="kaggle") + async def chatter_kaggle(self, ctx: commands.Context): + """Register with the kaggle API to download additional datasets for training""" + if not await self.check_for_kaggle(): + await ctx.maybe_send_embed( + "[Click here for instructions to setup the kaggle api](https://github.com/Kaggle/kaggle-api#api-credentials)" + ) + @commands.is_owner() @chatter.command(name="backup") async def backup(self, ctx, backupname): @@ -376,7 +389,13 @@ class Chatter(Cog): await ctx.maybe_send_embed("Error occurred :(") @commands.is_owner() - @chatter.command(name="trainubuntu") + @chatter.group(name="train") + async def chatter_train(self, ctx: commands.Context): + """Commands for training the bot""" + pass + + @commands.is_owner() + @chatter_train.command(name="ubuntu") async def chatter_train_ubuntu(self, ctx: commands.Context, confirmation: bool = False): """ WARNING: Large Download! Trains the bot using Ubuntu Dialog Corpus data. @@ -385,7 +404,7 @@ class Chatter(Cog): if not confirmation: await ctx.maybe_send_embed( "Warning: This command downloads ~500MB then eats your CPU for training\n" - "If you're sure you want to continue, run `[p]chatter trainubuntu True`" + "If you're sure you want to continue, run `[p]chatter train ubuntu True`" ) return @@ -398,7 +417,29 @@ class Chatter(Cog): await ctx.send("Error occurred :(") @commands.is_owner() - @chatter.command(name="trainenglish") + @chatter_train.command(name="ubuntu2") + async def chatter_train_ubuntu2(self, ctx: commands.Context, confirmation: bool = False): + """ + WARNING: Large Download! Trains the bot using *NEW* Ubuntu Dialog Corpus data. + """ + + if not confirmation: + await ctx.maybe_send_embed( + "Warning: This command downloads ~800 then eats your CPU for training\n" + "If you're sure you want to continue, run `[p]chatter train ubuntu2 True`" + ) + return + + async with ctx.typing(): + future = await self._train_ubuntu2() + + if future: + await ctx.send("Training successful!") + else: + await ctx.send("Error occurred :(") + + @commands.is_owner() + @chatter_train.command(name="english") async def chatter_train_english(self, ctx: commands.Context): """ Trains the bot in english @@ -412,10 +453,27 @@ class Chatter(Cog): await ctx.maybe_send_embed("Error occurred :(") @commands.is_owner() - @chatter.command() - async def train(self, ctx: commands.Context, channel: discord.TextChannel): + @chatter_train.command(name="list") + async def chatter_train_list(self, ctx: commands.Context): + """Trains the bot based on an uploaded list. + + Must be a file in the format of a python list: ['prompt', 'response1', 'response2'] + """ + if not ctx.message.attachments: + await ctx.maybe_send_embed("You must upload a file when using this command") + return + + attachment: discord.Attachment = ctx.message.attachments[0] + + a_bytes = await attachment.read() + + await ctx.send("Not yet implemented") + + @commands.is_owner() + @chatter_train.command(name="channel") + async def chatter_train_channel(self, ctx: commands.Context, channel: discord.TextChannel): """ - Trains the bot based on language in this guild + Trains the bot based on language in this guild. """ await ctx.maybe_send_embed( @@ -502,7 +560,7 @@ class Chatter(Cog): if self._last_message_per_channel[ctx.channel.id] is not None: last_m: discord.Message = self._last_message_per_channel[ctx.channel.id] minutes = self._guild_cache[ctx.guild.id]["convo_delta"] - if (datetime.utcnow() - last_m.created_at).seconds > minutes*60: + if (datetime.utcnow() - last_m.created_at).seconds > minutes * 60: in_response_to = None else: in_response_to = last_m.content @@ -511,7 +569,7 @@ class Chatter(Cog): if in_response_to is None: log.debug("Generating response") - Statement = self.chatbot.storage.get_object('statement') + Statement = self.chatbot.storage.get_object("statement") future = await self.loop.run_in_executor( None, self.chatbot.generate_response, Statement(text) ) @@ -525,3 +583,6 @@ class Chatter(Cog): self._last_message_per_channel[ctx.channel.id] = await ctx.send(str(future)) else: await ctx.send(":thinking:") + + async def check_for_kaggle(self): + return False diff --git a/chatter/info.json b/chatter/info.json index b79e587..a048c23 100644 --- a/chatter/info.json +++ b/chatter/info.json @@ -17,7 +17,8 @@ "pytz", "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz#egg=en_core_web_sm", "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.3.1/en_core_web_md-2.3.1.tar.gz#egg=en_core_web_md", - "spacy>=2.3,<2.4" + "spacy>=2.3,<2.4", + "kaggle" ], "short": "Local Chatbot run on machine learning", "end_user_data_statement": "This cog only stores anonymous conversations data; no End User Data is stored.", diff --git a/chatter/trainers.py b/chatter/trainers.py index 42d6288..0b765b7 100644 --- a/chatter/trainers.py +++ b/chatter/trainers.py @@ -1,6 +1,146 @@ +import asyncio +import csv +import logging +import os +import pathlib +import time +from functools import partial + from chatterbot import utils from chatterbot.conversation import Statement +from chatterbot.tagging import PosLemmaTagger from chatterbot.trainers import Trainer +from redbot.core.bot import Red +from dateutil import parser as date_parser +from redbot.core.utils import AsyncIter + +log = logging.getLogger("red.fox_v3.chatter.trainers") + + +class KaggleTrainer(Trainer): + def __init__(self, chatbot, datapath: pathlib.Path, **kwargs): + super().__init__(chatbot, **kwargs) + + self.data_directory = datapath / kwargs.get("downloadpath", "kaggle_download") + + self.kaggle_dataset = kwargs.get( + "kaggle_dataset", + "Cornell-University/movie-dialog-corpus", + ) + + # Create the data directory if it does not already exist + if not os.path.exists(self.data_directory): + os.makedirs(self.data_directory) + + def is_downloaded(self, file_path): + """ + Check if the data file is already downloaded. + """ + if os.path.exists(file_path): + self.chatbot.logger.info("File is already downloaded") + return True + + return False + + async def download(self, dataset): + import kaggle # This triggers the API token check + + future = await asyncio.get_event_loop().run_in_executor( + None, + partial( + kaggle.api.dataset_download_files, + dataset=dataset, + path=self.data_directory, + quiet=False, + unzip=True, + ), + ) + + +class UbuntuCorpusTrainer2(KaggleTrainer): + def __init__(self, chatbot, datapath: pathlib.Path, **kwargs): + super().__init__( + chatbot, + datapath, + downloadpath="ubuntu_data_v2", + kaggle_dataset="rtatman/ubuntu-dialogue-corpus", + **kwargs + ) + + async def asynctrain(self, *args, **kwargs): + extracted_dir = self.data_directory / "Ubuntu-dialogue-corpus" + + # Download and extract the Ubuntu dialog corpus if needed + if not extracted_dir.exists(): + await self.download(self.kaggle_dataset) + else: + log.info("Ubuntu dialogue already downloaded") + if not extracted_dir.exists(): + raise FileNotFoundError("Did not extract in the expected way") + + train_dialogue = kwargs.get("train_dialogue", True) + train_196_dialogue = kwargs.get("train_196", False) + train_301_dialogue = kwargs.get("train_301", False) + + if train_dialogue: + await self.run_dialogue_training(extracted_dir, "dialogueText.csv") + + if train_196_dialogue: + await self.run_dialogue_training(extracted_dir, "dialogueText_196.csv") + + if train_301_dialogue: + await self.run_dialogue_training(extracted_dir, "dialogueText_301.csv") + + async def run_dialogue_training(self, extracted_dir, dialogue_file): + log.info(f"Beginning dialogue training on {dialogue_file}") + start_time = time.time() + + tagger = PosLemmaTagger(language=self.chatbot.storage.tagger.language) + + with open(extracted_dir / dialogue_file, "r", encoding="utf-8") as dg: + reader = csv.DictReader(dg) + + next(reader) # Skip the header + + last_dialogue_id = None + previous_statement_text = None + previous_statement_search_text = "" + statements_from_file = [] + + async for row in AsyncIter(reader): + dialogue_id = row["dialogueID"] + if dialogue_id != last_dialogue_id: + previous_statement_text = None + previous_statement_search_text = "" + last_dialogue_id = dialogue_id + + if len(row) > 0: + statement = Statement( + text=row["text"], + in_response_to=previous_statement_text, + conversation="training", + created_at=date_parser.parse(row["date"]), + persona=row["from"], + ) + + for preprocessor in self.chatbot.preprocessors: + statement = preprocessor(statement) + + statement.search_text = tagger.get_text_index_string(statement.text) + statement.search_in_response_to = previous_statement_search_text + + previous_statement_text = statement.text + previous_statement_search_text = statement.search_text + + statements_from_file.append(statement) + + if statements_from_file: + self.chatbot.storage.create_many(statements_from_file) + + print("Training took", time.time() - start_time, "seconds.") + + def train(self, *args, **kwargs): + log.error("See asynctrain instead") class TwitterCorpusTrainer(Trainer): @@ -46,4 +186,4 @@ class TwitterCorpusTrainer(Trainer): # # statements_to_create.append(statement) # - # self.chatbot.storage.create_many(statements_to_create) \ No newline at end of file + # self.chatbot.storage.create_many(statements_to_create) From 8acbc5d9645e1e23e65d60eb00e929d202c4a3e5 Mon Sep 17 00:00:00 2001 From: bobloy Date: Mon, 15 Mar 2021 15:48:34 -0400 Subject: [PATCH 07/21] Whatever this commit is --- chatter/chat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/chatter/chat.py b/chatter/chat.py index e29c317..500284c 100644 --- a/chatter/chat.py +++ b/chatter/chat.py @@ -620,7 +620,9 @@ class Chatter(Cog): replying = message if future and str(future): - self._last_message_per_channel[ctx.channel.id] = await channel.send(str(future), reference=replying) + self._last_message_per_channel[ctx.channel.id] = await channel.send( + str(future), reference=replying + ) else: await ctx.send(":thinking:") From 7811c71edbcb7b059e0181fc87cfb3934ba95c53 Mon Sep 17 00:00:00 2001 From: bobloy Date: Tue, 16 Mar 2021 16:00:42 -0400 Subject: [PATCH 08/21] Use is_reply to train --- chatter/chat.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/chatter/chat.py b/chatter/chat.py index 500284c..81d09a8 100644 --- a/chatter/chat.py +++ b/chatter/chat.py @@ -564,13 +564,13 @@ class Chatter(Cog): # Thank you Cog-Creators channel: discord.TextChannel = message.channel - # is_reply = False # this is only useful with in_response_to + is_reply = False # this is only useful with in_response_to if ( message.reference is not None and isinstance(message.reference.resolved, discord.Message) and message.reference.resolved.author.id == self.bot.user.id ): - # is_reply = True # this is only useful with in_response_to + is_reply = True # this is only useful with in_response_to pass # this is a reply to the bot, good to go elif guild is not None and channel.id == await self.config.guild(guild).chatchannel(): pass # good to go @@ -592,7 +592,9 @@ class Chatter(Cog): if not self._guild_cache[ctx.guild.id]: self._guild_cache[ctx.guild.id] = await self.config.guild(ctx.guild).all() - if self._last_message_per_channel[ctx.channel.id] is not None: + if is_reply: + in_response_to = message.reference.resolved.content + elif self._last_message_per_channel[ctx.channel.id] is not None: last_m: discord.Message = self._last_message_per_channel[ctx.channel.id] minutes = self._guild_cache[ctx.guild.id]["convo_delta"] if (datetime.utcnow() - last_m.created_at).seconds > minutes * 60: @@ -602,16 +604,25 @@ class Chatter(Cog): else: in_response_to = None - if in_response_to is None: - log.debug("Generating response") - Statement = self.chatbot.storage.get_object("statement") - future = await self.loop.run_in_executor( - None, self.chatbot.generate_response, Statement(text) - ) - else: - log.debug("Getting response") - future = await self.loop.run_in_executor( - None, partial(self.chatbot.get_response, text, in_response_to=in_response_to) + # Always use generate reponse + # Chatterbot tries to learn based on the result it comes up with, which is dumb + log.debug("Generating response") + Statement = self.chatbot.storage.get_object("statement") + future = await self.loop.run_in_executor( + None, self.chatbot.generate_response, Statement(text) + ) + + if in_response_to is not None: + log.debug("learning response") + learning_task = asyncio.create_task( + self.loop.run_in_executor( + None, + partial( + self.chatbot.learn_response, + Statement(text), + previous_statement=in_response_to, + ), + ) ) replying = None From dad14fe972fa9382b58adcc226d65e0cca4ad620 Mon Sep 17 00:00:00 2001 From: bobloy Date: Thu, 18 Mar 2021 16:08:10 -0400 Subject: [PATCH 09/21] black reformatting --- chatter/trainers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chatter/trainers.py b/chatter/trainers.py index 0b765b7..dc0e0b1 100644 --- a/chatter/trainers.py +++ b/chatter/trainers.py @@ -64,7 +64,7 @@ class UbuntuCorpusTrainer2(KaggleTrainer): datapath, downloadpath="ubuntu_data_v2", kaggle_dataset="rtatman/ubuntu-dialogue-corpus", - **kwargs + **kwargs, ) async def asynctrain(self, *args, **kwargs): From 8200cd9af1dfd33edd04e85e1f34af5988214501 Mon Sep 17 00:00:00 2001 From: bobloy Date: Fri, 19 Mar 2021 15:54:19 -0400 Subject: [PATCH 10/21] Run futures correctly --- chatter/chat.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/chatter/chat.py b/chatter/chat.py index 727efc2..7d3c40f 100644 --- a/chatter/chat.py +++ b/chatter/chat.py @@ -613,15 +613,13 @@ class Chatter(Cog): if in_response_to is not None: log.debug("learning response") - learning_task = asyncio.create_task( - self.loop.run_in_executor( - None, - partial( - self.chatbot.learn_response, - Statement(text), - previous_statement=in_response_to, - ), - ) + await self.loop.run_in_executor( + None, + partial( + self.chatbot.learn_response, + Statement(text), + previous_statement=in_response_to, + ), ) replying = None @@ -637,4 +635,6 @@ class Chatter(Cog): await ctx.send(":thinking:") async def check_for_kaggle(self): + """Check whether Kaggle is installed and configured properly""" + # TODO: This return False From eac7aee82c4ab29a40f79d2f1dbb16556d58672f Mon Sep 17 00:00:00 2001 From: bobloy Date: Fri, 19 Mar 2021 15:54:35 -0400 Subject: [PATCH 11/21] Save every 50 instead of all at once, so it can be cancelled --- chatter/trainers.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/chatter/trainers.py b/chatter/trainers.py index dc0e0b1..1fe5f62 100644 --- a/chatter/trainers.py +++ b/chatter/trainers.py @@ -107,19 +107,27 @@ class UbuntuCorpusTrainer2(KaggleTrainer): previous_statement_search_text = "" statements_from_file = [] + save_every = 50 + count = 0 + async for row in AsyncIter(reader): dialogue_id = row["dialogueID"] if dialogue_id != last_dialogue_id: previous_statement_text = None previous_statement_search_text = "" last_dialogue_id = dialogue_id + count += 1 + if count >= save_every: + if statements_from_file: + self.chatbot.storage.create_many(statements_from_file) + count = 0 if len(row) > 0: statement = Statement( text=row["text"], in_response_to=previous_statement_text, conversation="training", - created_at=date_parser.parse(row["date"]), + # created_at=date_parser.parse(row["date"]), persona=row["from"], ) From 04ccb435f8512b79a1c02759cd8a459d04f120a0 Mon Sep 17 00:00:00 2001 From: bobloy Date: Thu, 25 Mar 2021 09:51:41 -0400 Subject: [PATCH 12/21] Implement `check_same_thread` = False storage adapter. Add start of AsyncSQLStorageAdapter --- chatter/storage_adapters.py | 73 +++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 chatter/storage_adapters.py diff --git a/chatter/storage_adapters.py b/chatter/storage_adapters.py new file mode 100644 index 0000000..4de2f00 --- /dev/null +++ b/chatter/storage_adapters.py @@ -0,0 +1,73 @@ +from chatterbot.storage import StorageAdapter, SQLStorageAdapter + + +class MyDumbSQLStorageAdapter(SQLStorageAdapter): + def __init__(self, **kwargs): + super(SQLStorageAdapter, self).__init__(**kwargs) + + from sqlalchemy import create_engine + from sqlalchemy.orm import sessionmaker + + self.database_uri = kwargs.get("database_uri", False) + + # None results in a sqlite in-memory database as the default + if self.database_uri is None: + self.database_uri = "sqlite://" + + # Create a file database if the database is not a connection string + if not self.database_uri: + self.database_uri = "sqlite:///db.sqlite3" + + self.engine = create_engine( + self.database_uri, convert_unicode=True, connect_args={"check_same_thread": False} + ) + + if self.database_uri.startswith("sqlite://"): + from sqlalchemy.engine import Engine + from sqlalchemy import event + + @event.listens_for(Engine, "connect") + def set_sqlite_pragma(dbapi_connection, connection_record): + dbapi_connection.execute("PRAGMA journal_mode=WAL") + dbapi_connection.execute("PRAGMA synchronous=NORMAL") + + if not self.engine.dialect.has_table(self.engine, "Statement"): + self.create_database() + + self.Session = sessionmaker(bind=self.engine, expire_on_commit=True) + + +class AsyncSQLStorageAdapter(SQLStorageAdapter): + def __init__(self, **kwargs): + super(SQLStorageAdapter, self).__init__(**kwargs) + + self.database_uri = kwargs.get("database_uri", False) + + # None results in a sqlite in-memory database as the default + if self.database_uri is None: + self.database_uri = "sqlite://" + + # Create a file database if the database is not a connection string + if not self.database_uri: + self.database_uri = "sqlite:///db.sqlite3" + + async def initialize(self): + # from sqlalchemy import create_engine + from aiomysql.sa import create_engine + from sqlalchemy.orm import sessionmaker + + self.engine = await create_engine(self.database_uri, convert_unicode=True) + + if self.database_uri.startswith("sqlite://"): + from sqlalchemy.engine import Engine + from sqlalchemy import event + + @event.listens_for(Engine, "connect") + def set_sqlite_pragma(dbapi_connection, connection_record): + dbapi_connection.execute("PRAGMA journal_mode=WAL") + dbapi_connection.execute("PRAGMA synchronous=NORMAL") + + if not self.engine.dialect.has_table(self.engine, "Statement"): + self.create_database() + + self.Session = sessionmaker(bind=self.engine, expire_on_commit=True) From 8feb21e34b70f26acf12c7d5af46e673032c9dc6 Mon Sep 17 00:00:00 2001 From: bobloy Date: Thu, 25 Mar 2021 09:52:20 -0400 Subject: [PATCH 13/21] Add new kaggle trainers --- chatter/trainers.py | 155 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 151 insertions(+), 4 deletions(-) diff --git a/chatter/trainers.py b/chatter/trainers.py index 1fe5f62..d8de22c 100644 --- a/chatter/trainers.py +++ b/chatter/trainers.py @@ -1,5 +1,6 @@ import asyncio import csv +import html import logging import os import pathlib @@ -56,13 +57,159 @@ class KaggleTrainer(Trainer): ), ) + def train(self, *args, **kwargs): + log.error("See asynctrain instead") -class UbuntuCorpusTrainer2(KaggleTrainer): + def asynctrain(self, *args, **kwargs): + raise self.TrainerInitializationException() + + +class SouthParkTrainer(KaggleTrainer): def __init__(self, chatbot, datapath: pathlib.Path, **kwargs): super().__init__( chatbot, datapath, downloadpath="ubuntu_data_v2", + kaggle_dataset="tovarischsukhov/southparklines", + **kwargs, + ) + + +class MovieTrainer(KaggleTrainer): + def __init__(self, chatbot, datapath: pathlib.Path, **kwargs): + super().__init__( + chatbot, + datapath, + downloadpath="kaggle_movies", + kaggle_dataset="Cornell-University/movie-dialog-corpus", + **kwargs, + ) + + async def run_movie_training(self): + dialogue_file = "movie_lines.tsv" + conversation_file = "movie_conversations.tsv" + log.info(f"Beginning dialogue training on {dialogue_file}") + start_time = time.time() + + tagger = PosLemmaTagger(language=self.chatbot.storage.tagger.language) + + # [lineID, characterID, movieID, character name, text of utterance] + # File parsing from https://www.kaggle.com/mushaya/conversation-chatbot + + with open(self.data_directory / conversation_file, "r", encoding="utf-8-sig") as conv_tsv: + conv_lines = conv_tsv.readlines() + with open(self.data_directory / dialogue_file, "r", encoding="utf-8-sig") as lines_tsv: + dialog_lines = lines_tsv.readlines() + + # trans_dict = str.maketrans({"": "__", "": "__", '""': '"'}) + + lines_dict = {} + for line in dialog_lines: + _line = line[:-1].strip('"').split("\t") + if len(_line) >= 5: # Only good lines + lines_dict[_line[0]] = ( + html.unescape(("".join(_line[4:])).strip()) + .replace("", "__") + .replace("", "__") + .replace('""', '"') + ) + else: + log.debug(f"Bad line {_line}") + + # collecting line ids for each conversation + conv = [] + for line in conv_lines[:-1]: + _line = line[:-1].split("\t")[-1][1:-1].replace("'", "").replace(" ", ",") + conv.append(_line.split(",")) + + # conversations = csv.reader(conv_tsv, delimiter="\t") + # + # reader = csv.reader(lines_tsv, delimiter="\t") + # + # + # + # lines_dict = {} + # for row in reader: + # try: + # lines_dict[row[0].strip('"')] = row[4] + # except: + # log.exception(f"Bad line: {row}") + # pass + # else: + # # print(f"Good line: {row}") + # pass + # + # # lines_dict = {row[0].strip('"'): row[4] for row in reader_list} + + statements_from_file = [] + + # [characterID of first, characterID of second, movieID, list of utterances] + async for lines in AsyncIter(conv): + previous_statement_text = None + previous_statement_search_text = "" + + for line in lines: + text = lines_dict[line] + statement = Statement( + text=text, + in_response_to=previous_statement_text, + conversation="training", + ) + + for preprocessor in self.chatbot.preprocessors: + statement = preprocessor(statement) + + statement.search_text = tagger.get_text_index_string(statement.text) + statement.search_in_response_to = previous_statement_search_text + + previous_statement_text = statement.text + previous_statement_search_text = statement.search_text + + statements_from_file.append(statement) + + if statements_from_file: + print(statements_from_file) + self.chatbot.storage.create_many(statements_from_file) + statements_from_file = [] + + print("Training took", time.time() - start_time, "seconds.") + + async def asynctrain(self, *args, **kwargs): + extracted_lines = self.data_directory / "movie_lines.tsv" + extracted_lines: pathlib.Path + + # Download and extract the Ubuntu dialog corpus if needed + if not extracted_lines.exists(): + await self.download(self.kaggle_dataset) + else: + log.info("Movie dialog already downloaded") + if not extracted_lines.exists(): + raise FileNotFoundError(f"{extracted_lines}") + + await self.run_movie_training() + + return True + + # train_dialogue = kwargs.get("train_dialogue", True) + # train_196_dialogue = kwargs.get("train_196", False) + # train_301_dialogue = kwargs.get("train_301", False) + # + # if train_dialogue: + # await self.run_dialogue_training(extracted_dir, "dialogueText.csv") + # + # if train_196_dialogue: + # await self.run_dialogue_training(extracted_dir, "dialogueText_196.csv") + # + # if train_301_dialogue: + # await self.run_dialogue_training(extracted_dir, "dialogueText_301.csv") + + +class UbuntuCorpusTrainer2(KaggleTrainer): + def __init__(self, chatbot, datapath: pathlib.Path, **kwargs): + super().__init__( + chatbot, + datapath, + downloadpath="kaggle_ubuntu", kaggle_dataset="rtatman/ubuntu-dialogue-corpus", **kwargs, ) @@ -91,6 +238,8 @@ class UbuntuCorpusTrainer2(KaggleTrainer): if train_301_dialogue: await self.run_dialogue_training(extracted_dir, "dialogueText_301.csv") + return True + async def run_dialogue_training(self, extracted_dir, dialogue_file): log.info(f"Beginning dialogue training on {dialogue_file}") start_time = time.time() @@ -120,6 +269,7 @@ class UbuntuCorpusTrainer2(KaggleTrainer): if count >= save_every: if statements_from_file: self.chatbot.storage.create_many(statements_from_file) + statements_from_file = [] count = 0 if len(row) > 0: @@ -147,9 +297,6 @@ class UbuntuCorpusTrainer2(KaggleTrainer): print("Training took", time.time() - start_time, "seconds.") - def train(self, *args, **kwargs): - log.error("See asynctrain instead") - class TwitterCorpusTrainer(Trainer): pass From ac9cf1e589308e3489a4e4b2d3759faa129009f9 Mon Sep 17 00:00:00 2001 From: bobloy Date: Thu, 25 Mar 2021 09:52:43 -0400 Subject: [PATCH 14/21] Implement movie trainer, guild cache, and learning toggle --- chatter/chat.py | 145 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 109 insertions(+), 36 deletions(-) diff --git a/chatter/chat.py b/chatter/chat.py index 7d3c40f..65966fa 100644 --- a/chatter/chat.py +++ b/chatter/chat.py @@ -17,7 +17,7 @@ from redbot.core.commands import Cog from redbot.core.data_manager import cog_data_path from redbot.core.utils.predicates import MessagePredicate -from chatter.trainers import TwitterCorpusTrainer, UbuntuCorpusTrainer2 +from chatter.trainers import MovieTrainer, TwitterCorpusTrainer, UbuntuCorpusTrainer2 log = logging.getLogger("red.fox_v3.chatter") @@ -63,6 +63,7 @@ class Chatter(Cog): "convo_delta": 15, "chatchannel": None, "reply": True, + "learning": True, } path: pathlib.Path = cog_data_path(self) self.data_path = path / "database.sqlite3" @@ -95,7 +96,8 @@ class Chatter(Cog): return ChatBot( "ChatterBot", - storage_adapter="chatterbot.storage.SQLStorageAdapter", + # storage_adapter="chatterbot.storage.SQLStorageAdapter", + storage_adapter="chatter.storage_adapters.MyDumbSQLStorageAdapter", database_uri="sqlite:///" + str(self.data_path), statement_comparison_function=self.similarity_algo, response_selection_method=get_random_response, @@ -176,10 +178,30 @@ class Chatter(Cog): trainer.train() return True - async def _train_ubuntu2(self): - trainer = UbuntuCorpusTrainer2(self.chatbot, cog_data_path(self)) + async def _train_movies(self): + trainer = MovieTrainer(self.chatbot, cog_data_path(self)) await trainer.asynctrain() + async def _train_ubuntu2(self, intensity): + train_kwarg = {} + if intensity == 196: + train_kwarg["train_dialogue"] = False + train_kwarg["train_196"] = True + elif intensity == 301: + train_kwarg["train_dialogue"] = False + train_kwarg["train_301"] = True + elif intensity == 497: + train_kwarg["train_dialogue"] = False + train_kwarg["train_196"] = True + train_kwarg["train_301"] = True + elif intensity >= 9000: # NOT 9000! + train_kwarg["train_dialogue"] = True + train_kwarg["train_196"] = True + train_kwarg["train_301"] = True + + trainer = UbuntuCorpusTrainer2(self.chatbot, cog_data_path(self)) + return await trainer.asynctrain(**train_kwarg) + def _train_english(self): trainer = ChatterBotCorpusTrainer(self.chatbot) # try: @@ -205,7 +227,7 @@ class Chatter(Cog): """ Base command for this cog. Check help for the commands list. """ - pass + self._guild_cache[ctx.guild.id] = {} # Clear cache when modifying values @commands.admin() @chatter.command(name="channel") @@ -240,19 +262,39 @@ class Chatter(Cog): await self.config.guild(ctx.guild).reply.set(toggle) if toggle: - await ctx.send("I will now respond to you if conversation continuity is not present") + await ctx.maybe_send_embed("I will now respond to you if conversation continuity is not present") else: - await ctx.send( + await ctx.maybe_send_embed( "I will not reply to your message if conversation continuity is not present, anymore" ) + @commands.admin() + @chatter.command(name="learning") + async def chatter_learning(self, ctx: commands.Context, toggle: Optional[bool] = None): + """ + Toggle the bot learning from its conversations. + + This is on by default. + """ + learning = await self.config.guild(ctx.guild).learning() + if toggle is None: + toggle = not learning + await self.config.guild(ctx.guild).learning.set(toggle) + + if toggle: + await ctx.maybe_send_embed("I will now learn from conversations.") + else: + await ctx.maybe_send_embed("I will no longer learn from conversations.") + @commands.is_owner() @chatter.command(name="cleardata") async def chatter_cleardata(self, ctx: commands.Context, confirm: bool = False): """ - This command will erase all training data and reset your configuration settings + This command will erase all training data and reset your configuration settings. - Use `[p]chatter cleardata True` + This applies to all guilds. + + Use `[p]chatter cleardata True` to confirm. """ if not confirm: @@ -364,7 +406,6 @@ class Chatter(Cog): return await self.config.guild(ctx.guild).convo_delta.set(minutes) - self._guild_cache[ctx.guild.id]["convo_delta"] = minutes await ctx.tick() @@ -420,51 +461,85 @@ class Chatter(Cog): """Commands for training the bot""" pass - @commands.is_owner() - @chatter_train.command(name="ubuntu") - async def chatter_train_ubuntu(self, ctx: commands.Context, confirmation: bool = False): + @chatter_train.group(name="kaggle") + async def chatter_train_kaggle(self, ctx: commands.Context): """ - WARNING: Large Download! Trains the bot using Ubuntu Dialog Corpus data. + Base command for kaggle training sets. + + See `[p]chatter kaggle` for details on how to enable this option + """ + pass + + @chatter_train_kaggle.command(name="ubuntu") + async def chatter_train_kaggle_ubuntu( + self, ctx: commands.Context, confirmation: bool = False, intensity=0 + ): + """ + WARNING: Large Download! Trains the bot using *NEW* Ubuntu Dialog Corpus data. """ if not confirmation: await ctx.maybe_send_embed( - "Warning: This command downloads ~500MB then eats your CPU for training\n" - "If you're sure you want to continue, run `[p]chatter train ubuntu True`" + "Warning: This command downloads ~800 then eats your CPU for training\n" + "If you're sure you want to continue, run `[p]chatter train kaggle ubuntu True`" ) return async with ctx.typing(): - future = await self.loop.run_in_executor(None, self._train_ubuntu) + future = await self._train_ubuntu2(intensity) if future: - await ctx.send("Training successful!") + await ctx.maybe_send_embed("Training successful!") else: - await ctx.send("Error occurred :(") + await ctx.maybe_send_embed("Error occurred :(") - @commands.is_owner() - @chatter_train.command(name="ubuntu2") - async def chatter_train_ubuntu2(self, ctx: commands.Context, confirmation: bool = False): + @chatter_train_kaggle.command(name="movies") + async def chatter_train_kaggle_movies(self, ctx: commands.Context, confirmation: bool = False): """ - WARNING: Large Download! Trains the bot using *NEW* Ubuntu Dialog Corpus data. + WARNING: Language! Trains the bot using Cornell University's "Movie Dialog Corpus". + + This training set contains dialog from a spread of movies with different MPAA. + This dialog includes racism, sexism, and any number of sensitive topics. + + Use at your own risk. """ if not confirmation: await ctx.maybe_send_embed( "Warning: This command downloads ~800 then eats your CPU for training\n" - "If you're sure you want to continue, run `[p]chatter train ubuntu2 True`" + "If you're sure you want to continue, run `[p]chatter train kaggle movies True`" ) return async with ctx.typing(): - future = await self._train_ubuntu2() + future = await self._train_movies() if future: - await ctx.send("Training successful!") + await ctx.maybe_send_embed("Training successful!") else: - await ctx.send("Error occurred :(") + await ctx.maybe_send_embed("Error occurred :(") + + @chatter_train.command(name="ubuntu") + async def chatter_train_ubuntu(self, ctx: commands.Context, confirmation: bool = False): + """ + WARNING: Large Download! Trains the bot using Ubuntu Dialog Corpus data. + """ + + if not confirmation: + await ctx.maybe_send_embed( + "Warning: This command downloads ~500MB then eats your CPU for training\n" + "If you're sure you want to continue, run `[p]chatter train ubuntu True`" + ) + return + + async with ctx.typing(): + future = await self.loop.run_in_executor(None, self._train_ubuntu) + + if future: + await ctx.maybe_send_embed("Training successful!") + else: + await ctx.maybe_send_embed("Error occurred :(") - @commands.is_owner() @chatter_train.command(name="english") async def chatter_train_english(self, ctx: commands.Context): """ @@ -478,7 +553,6 @@ class Chatter(Cog): else: await ctx.maybe_send_embed("Error occurred :(") - @commands.is_owner() @chatter_train.command(name="list") async def chatter_train_list(self, ctx: commands.Context): """Trains the bot based on an uploaded list. @@ -495,7 +569,6 @@ class Chatter(Cog): await ctx.send("Not yet implemented") - @commands.is_owner() @chatter_train.command(name="channel") async def chatter_train_channel(self, ctx: commands.Context, channel: discord.TextChannel): """ @@ -563,6 +636,9 @@ class Chatter(Cog): # Thank you Cog-Creators channel: discord.TextChannel = message.channel + if not self._guild_cache[guild.id]: + self._guild_cache[guild.id] = await self.config.guild(guild).all() + is_reply = False # this is only useful with in_response_to if ( message.reference is not None @@ -571,7 +647,7 @@ class Chatter(Cog): ): is_reply = True # this is only useful with in_response_to pass # this is a reply to the bot, good to go - elif guild is not None and channel.id == await self.config.guild(guild).chatchannel(): + elif guild is not None and channel.id == self._guild_cache[guild.id]["chatchannel"]: pass # good to go else: when_mentionables = commands.when_mentioned(self.bot, message) @@ -588,9 +664,6 @@ class Chatter(Cog): async with ctx.typing(): - if not self._guild_cache[ctx.guild.id]: - self._guild_cache[ctx.guild.id] = await self.config.guild(ctx.guild).all() - if is_reply: in_response_to = message.reference.resolved.content elif self._last_message_per_channel[ctx.channel.id] is not None: @@ -611,7 +684,7 @@ class Chatter(Cog): None, self.chatbot.generate_response, Statement(text) ) - if in_response_to is not None: + if in_response_to is not None and self._guild_cache[guild.id]["learning"]: log.debug("learning response") await self.loop.run_in_executor( None, @@ -623,7 +696,7 @@ class Chatter(Cog): ) replying = None - if await self.config.guild(guild).reply(): + if self._guild_cache[guild.id]["reply"]: if message != ctx.channel.last_message: replying = message From b4f20dd7d283ed64ab7429824839a533c8abf2e7 Mon Sep 17 00:00:00 2001 From: bobloy Date: Thu, 25 Mar 2021 09:54:14 -0400 Subject: [PATCH 15/21] Don't print everything, use log --- chatter/trainers.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/chatter/trainers.py b/chatter/trainers.py index d8de22c..962fa08 100644 --- a/chatter/trainers.py +++ b/chatter/trainers.py @@ -136,7 +136,7 @@ class MovieTrainer(KaggleTrainer): # log.exception(f"Bad line: {row}") # pass # else: - # # print(f"Good line: {row}") + # # log.info(f"Good line: {row}") # pass # # # lines_dict = {row[0].strip('"'): row[4] for row in reader_list} @@ -168,11 +168,10 @@ class MovieTrainer(KaggleTrainer): statements_from_file.append(statement) if statements_from_file: - print(statements_from_file) self.chatbot.storage.create_many(statements_from_file) statements_from_file = [] - print("Training took", time.time() - start_time, "seconds.") + log.info("Training took", time.time() - start_time, "seconds.") async def asynctrain(self, *args, **kwargs): extracted_lines = self.data_directory / "movie_lines.tsv" @@ -295,7 +294,7 @@ class UbuntuCorpusTrainer2(KaggleTrainer): if statements_from_file: self.chatbot.storage.create_many(statements_from_file) - print("Training took", time.time() - start_time, "seconds.") + log.info("Training took", time.time() - start_time, "seconds.") class TwitterCorpusTrainer(Trainer): From 59fd96fc5af9d1a0ab9e5c70199f40369381c6ba Mon Sep 17 00:00:00 2001 From: bobloy Date: Thu, 25 Mar 2021 10:01:56 -0400 Subject: [PATCH 16/21] add save_every for less disk intensive work. --- chatter/trainers.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/chatter/trainers.py b/chatter/trainers.py index 962fa08..adf042f 100644 --- a/chatter/trainers.py +++ b/chatter/trainers.py @@ -142,6 +142,8 @@ class MovieTrainer(KaggleTrainer): # # lines_dict = {row[0].strip('"'): row[4] for row in reader_list} statements_from_file = [] + save_every = 50 + count = 0 # [characterID of first, characterID of second, movieID, list of utterances] async for lines in AsyncIter(conv): @@ -167,9 +169,15 @@ class MovieTrainer(KaggleTrainer): statements_from_file.append(statement) - if statements_from_file: - self.chatbot.storage.create_many(statements_from_file) - statements_from_file = [] + count += 1 + if count >= save_every: + if statements_from_file: + self.chatbot.storage.create_many(statements_from_file) + statements_from_file = [] + count = 0 + + if statements_from_file: + self.chatbot.storage.create_many(statements_from_file) log.info("Training took", time.time() - start_time, "seconds.") From 802929d757458f9ff4fac99203ced1f033c9bdbc Mon Sep 17 00:00:00 2001 From: bobloy Date: Thu, 25 Mar 2021 10:02:02 -0400 Subject: [PATCH 17/21] better wording --- chatter/chat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/chatter/chat.py b/chatter/chat.py index 65966fa..9e3379f 100644 --- a/chatter/chat.py +++ b/chatter/chat.py @@ -480,7 +480,7 @@ class Chatter(Cog): if not confirmation: await ctx.maybe_send_embed( - "Warning: This command downloads ~800 then eats your CPU for training\n" + "Warning: This command downloads ~800MB and is CPU intensive during training\n" "If you're sure you want to continue, run `[p]chatter train kaggle ubuntu True`" ) return @@ -506,7 +506,7 @@ class Chatter(Cog): if not confirmation: await ctx.maybe_send_embed( - "Warning: This command downloads ~800 then eats your CPU for training\n" + "Warning: This command downloads ~29MB and is CPU intensive during training\n" "If you're sure you want to continue, run `[p]chatter train kaggle movies True`" ) return @@ -527,7 +527,7 @@ class Chatter(Cog): if not confirmation: await ctx.maybe_send_embed( - "Warning: This command downloads ~500MB then eats your CPU for training\n" + "Warning: This command downloads ~500MB and is CPU intensive during training\n" "If you're sure you want to continue, run `[p]chatter train ubuntu True`" ) return From 1319d98972e0b79677a34585fc0b1be2786802e2 Mon Sep 17 00:00:00 2001 From: bobloy Date: Thu, 25 Mar 2021 10:56:48 -0400 Subject: [PATCH 18/21] Less often, still writing too much. --- chatter/trainers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chatter/trainers.py b/chatter/trainers.py index adf042f..4f80b79 100644 --- a/chatter/trainers.py +++ b/chatter/trainers.py @@ -142,7 +142,7 @@ class MovieTrainer(KaggleTrainer): # # lines_dict = {row[0].strip('"'): row[4] for row in reader_list} statements_from_file = [] - save_every = 50 + save_every = 300 count = 0 # [characterID of first, characterID of second, movieID, list of utterances] From db24bb4db4f81d2248b82219d7953798be4dc585 Mon Sep 17 00:00:00 2001 From: bobloy Date: Thu, 25 Mar 2021 10:57:35 -0400 Subject: [PATCH 19/21] No differences --- chatter/chat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/chatter/chat.py b/chatter/chat.py index 9e3379f..fe50588 100644 --- a/chatter/chat.py +++ b/chatter/chat.py @@ -262,7 +262,9 @@ class Chatter(Cog): await self.config.guild(ctx.guild).reply.set(toggle) if toggle: - await ctx.maybe_send_embed("I will now respond to you if conversation continuity is not present") + await ctx.maybe_send_embed( + "I will now respond to you if conversation continuity is not present" + ) else: await ctx.maybe_send_embed( "I will not reply to your message if conversation continuity is not present, anymore" From 87187abbb3423fc6539864c4239b858d54b280e7 Mon Sep 17 00:00:00 2001 From: bobloy Date: Thu, 25 Mar 2021 11:11:57 -0400 Subject: [PATCH 20/21] Fix logging --- chatter/trainers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chatter/trainers.py b/chatter/trainers.py index 4f80b79..3cc92da 100644 --- a/chatter/trainers.py +++ b/chatter/trainers.py @@ -179,7 +179,7 @@ class MovieTrainer(KaggleTrainer): if statements_from_file: self.chatbot.storage.create_many(statements_from_file) - log.info("Training took", time.time() - start_time, "seconds.") + log.info(f"Training took {time.time() - start_time} seconds.") async def asynctrain(self, *args, **kwargs): extracted_lines = self.data_directory / "movie_lines.tsv" @@ -302,7 +302,7 @@ class UbuntuCorpusTrainer2(KaggleTrainer): if statements_from_file: self.chatbot.storage.create_many(statements_from_file) - log.info("Training took", time.time() - start_time, "seconds.") + log.info(f"Training took {time.time() - start_time} seconds.") class TwitterCorpusTrainer(Trainer): From e1297a4dcaec7b12bc1958a728f47d96cfdac5dc Mon Sep 17 00:00:00 2001 From: bobloy Date: Thu, 25 Mar 2021 11:12:05 -0400 Subject: [PATCH 21/21] Return success value --- chatter/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chatter/chat.py b/chatter/chat.py index fe50588..d999d94 100644 --- a/chatter/chat.py +++ b/chatter/chat.py @@ -180,7 +180,7 @@ class Chatter(Cog): async def _train_movies(self): trainer = MovieTrainer(self.chatbot, cog_data_path(self)) - await trainer.asynctrain() + return await trainer.asynctrain() async def _train_ubuntu2(self, intensity): train_kwarg = {}