diff --git a/README.md b/README.md index 3890e9b0..5014812a 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ [![license](https://img.shields.io/badge/license-MIT-green)](https://github.com/CornellNLP/ConvoKit/blob/master/LICENSE.md) [![Discord Community](https://img.shields.io/static/v1?logo=discord&style=flat&color=red&label=discord&message=community)](https://discord.gg/WMFqMWgz6P) -This toolkit contains tools to extract conversational features and analyze social phenomena in conversations, using a [single unified interface](https://convokit.cornell.edu/documentation/architecture.html) inspired by (and compatible with) scikit-learn. Several large [conversational datasets](https://github.com/CornellNLP/ConvoKit#datasets) are included together with scripts exemplifying the use of the toolkit on these datasets. The latest version is [3.5.0](https://github.com/CornellNLP/ConvoKit/releases/tag/v3.5.0) (released Oct. 15, 2025); follow the [project on GitHub](https://github.com/CornellNLP/ConvoKit) to keep track of updates. +This toolkit contains tools to extract conversational features and analyze social phenomena in conversations, using a [single unified interface](https://convokit.cornell.edu/documentation/architecture.html) inspired by (and compatible with) scikit-learn. Several large [conversational datasets](https://github.com/CornellNLP/ConvoKit#datasets) are included together with scripts exemplifying the use of the toolkit on these datasets. The latest version is [3.6.0](https://github.com/CornellNLP/ConvoKit/releases/tag/v3.6.0) (released Oct. 25, 2025); follow the [project on GitHub](https://github.com/CornellNLP/ConvoKit) to keep track of updates. Join our [Discord community](https://discord.gg/WMFqMWgz6P) to stay informed, connect with fellow developers, and be part of an engaging space where we share progress, discuss features, and tackle issues together. diff --git a/convokit/__init__.py b/convokit/__init__.py index 9eab81a1..9bd3462b 100644 --- a/convokit/__init__.py +++ b/convokit/__init__.py @@ -32,6 +32,8 @@ "utterance_likelihood": ".utterance_likelihood", "speaker_convo_helpers": ".speaker_convo_helpers", "politeness_collections": ".politeness_collections", + "genai": ".genai", + "convo_similarity": ".convo_similarity", "talktimesharing": ".talktimesharing", } diff --git a/convokit/convo_similarity/__init__.py b/convokit/convo_similarity/__init__.py new file mode 100644 index 00000000..7f3df958 --- /dev/null +++ b/convokit/convo_similarity/__init__.py @@ -0,0 +1,12 @@ +from .scd import SCD +from .condyns import ConDynS +from .naive_condyns import NaiveConDynS +from .baseline import ConDynSBaselines +from .utils import * + +__all__ = [ + "SCD", + "ConDynS", + "NaiveConDynS", + "ConDynSBaselines", +] diff --git a/convokit/convo_similarity/baseline.py b/convokit/convo_similarity/baseline.py new file mode 100644 index 00000000..38221e4a --- /dev/null +++ b/convokit/convo_similarity/baseline.py @@ -0,0 +1,208 @@ +import re +import json +from sentence_transformers import SentenceTransformer, util +from evaluate import load + +try: + from convokit.genai import get_llm_client + + GENAI_AVAILABLE = True +except ImportError: + GENAI_AVAILABLE = False + + +class ConDynSBaselines: + """A class providing baseline methods for computing conversation similarity to compare with ConDynS. + + This class provides various baseline methods for comparing conversations including + BERTScore, cosine similarity using sentence embeddings, and GPT-based comparison methods. 
+ The baseline methods are used to compare with ConDynS. + + :param model_provider: The GenAI provider to use (e.g., "gpt", "gemini") + :param model: Optional specific model name + :param sentence_transformer_model: Sentence transformer model to use for embeddings (default: "all-MiniLM-L6-v2") + :param device: Device to use for sentence transformer (default: "cpu") + """ + + def __init__( + self, + model_provider: str, + config, + model: str = None, + sentence_transformer_model: str = "all-MiniLM-L6-v2", + device: str = "cpu", + ): + """Initialize the ConDynSBaselines with specified models and configurations. + + :param model_provider: The GenAI provider to use (e.g., "gpt", "gemini") + :param model: Optional specific model name + :param sentence_transformer_model: Sentence transformer model to use for embeddings + :param device: Device to use for sentence transformer + :raises ImportError: If required dependencies are not available + """ + if not GENAI_AVAILABLE: + raise ImportError( + "GenAI dependencies not available. Please install via `pip install convokit[genai]`." + ) + + self.model_provider = model_provider + self.model = model + self.sentence_transformer_model = sentence_transformer_model + self.device = device + self.client = get_llm_client(model_provider, config, model=model) + self.st_model = SentenceTransformer(sentence_transformer_model, device=device) + self.util = util + self.bertscore = load("bertscore") + + def get_bertscore(self, pred, ref): + """Compute BERTScore between prediction and reference texts. + + Uses the BERTScore metric to evaluate semantic similarity between two texts. + + :param pred: Prediction text to evaluate + :param ref: Reference text to compare against + :return: BERTScore computation results + """ + a = [pred] + b = [ref] + return self.bertscore.compute( + predictions=a, references=b, model_type="distilbert-base-uncased" + ) + + def get_cosine_similarity(self, pred, ref): + """Compute cosine similarity between two texts using sentence embeddings. + + Uses the SentenceTransformer model to generate embeddings and computes + cosine similarity between them. + + :param pred: First text for comparison + :param ref: Second text for comparison + :return: Cosine similarity score between 0 and 1 + """ + embeddings = self.st_model.encode([pred, ref], convert_to_tensor=True) + similarity = self.util.cos_sim(embeddings[0], embeddings[1]) + return similarity.item() + + def _parse_gpt_responses(self, response): + """Parse and clean model responses containing JSON. + + Extracts JSON content from model responses that may contain markdown formatting + and handles potential parsing errors. + + :param response: Raw response text from model + :return: Parsed JSON data as dictionary + """ + clean_json_str = re.sub(r"```json|```", "", response).strip() + try: + parsed_data = json.loads(clean_json_str) + except json.JSONDecodeError: + print(f"Error decoding JSON for response: {response}") # Debugging output if needed + return parsed_data + + def get_gpt_compare_score(self, pred, ref, prompt): + """Compare two texts using GPT model with a custom prompt. + + Sends a formatted prompt to GPT model to compare two texts and returns + similarity score and reasoning. 
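For reference, a minimal usage sketch of the baselines defined above, assuming the `genai` extra is installed and a `GenAIConfigManager` has been configured as in the example notebooks; the provider name, model name, and the two sample SCD strings are illustrative only, not part of the package (the naive GPT comparison method used at the end is defined further below in this file).

```python
# Hypothetical usage sketch for ConDynSBaselines. The provider/model and the two SCD
# strings are illustrative; the config setup mirrors the example notebooks.
from convokit.genai.genai_config import GenAIConfigManager
from convokit.convo_similarity import ConDynSBaselines

config = GenAIConfigManager()
config.set_google_cloud_config("YOUR PROJECT", "YOUR LOCATION")

baselines = ConDynSBaselines(
    model_provider="gemini", config=config, model="gemini-2.0-flash-001"
)

scd_a = "SPEAKER1 opens with a firm claim; SPEAKER2 counters with hypotheticals; ..."
scd_b = "SPEAKER1 asks probing questions; SPEAKER2 gradually concedes; ..."

print(baselines.get_cosine_similarity(scd_a, scd_b))   # sentence-embedding cosine similarity
print(baselines.get_bertscore(scd_a, scd_b))           # BERTScore results dict
print(baselines.get_naive_gpt_compare_score_SCDs(scd_a, scd_b))  # (score, reason) from the LLM
```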
+ + :param pred: First text for comparison + :param ref: Second text for comparison + :param prompt: Prompt template to use for comparison + :return: Tuple of (similarity_score, reasoning) + """ + gpt_prompt = prompt.format(pred=pred, ref=ref) + response = self.client.generate(gpt_prompt) + parsed_response = self._parse_gpt_responses(response) + score = parsed_response["sim_score"] + reason = parsed_response["reason"] + return score, reason + + def get_naive_gpt_compare_score_SCDs(self, scd1, scd2): + """Compare two Summary of Conversation Dynamics (SCD) using GPT. + + Compares two SCD summaries and rates their similarity based on persuasion + trajectory and conversational dynamics, ignoring specific topics or claims. + + :param scd1: First SCD summary + :param scd2: Second SCD summary + :return: Tuple of (similarity_score, reasoning) + """ + naive_gpt_compare_scd_prompt = """Compare the following two summary of conversation dynamics (SCD) of two online conversations, rate the similarity of the two conversations on a scale from 1 to 100, based on their persuasion trajectory reflected in the SCDs. + +### **Key Aspects of Persuasion Trajectory** +- **Persuasion Strategies**: Logical reasoning, emotional appeals, rhetorical questions, citing authority, anecdotes, hypothetical scenarios, refuting counterarguments, shifting burden of proof, repetition, framing, social proof. +- **Interaction Dynamics**: Engagement patterns (e.g., single argument vs. back-and-forth), timing effects, persistence vs. resistance. +- **Response to Persuasion**: Agreement, concession, skepticism, counter-argument, disengagement, linguistic indicators (e.g., hedging, intensity, pronouns). +- **Trajectory of Persuasion**: Gradual shift, immediate agreement, persistent resistance, partial concession, reversal, stalemate. + +### **Ignore**: +- **Do not consider specific topics, claims, or arguments. + +### **Output Requirements** +Return a JSON object containing: +- `"sim_score"` (int): A similarity score between 0-100, representing how similar the conversations themselves are in **trajectory** based on the SCDs. +- `"reason"` (string, ≤30 words): A brief explanation of why the score was given, referencing key conversational dynamics. + +### **Output Format (JSON)** +```json +{{ + "sim_score": , + "reason": "" +}} + +### **Conversations** +Conversation 1 SCD: +{pred} + +Conversation 2 SCD: +{ref} +""" + score, reason = self.get_gpt_compare_score(scd1, scd2, naive_gpt_compare_scd_prompt) + return score, reason + + def get_naive_gpt_compare_score_Transcripts(self, transcript1, transcript2): + """Compare two conversation transcripts using GPT. + + Compares two conversation transcripts and rates their similarity based on + conversational trajectory and dynamics, ignoring specific topics discussed. + + :param transcript1: First conversation transcript + :param transcript2: Second conversation transcript + :return: Tuple of (similarity_score, reasoning) + """ + naive_gpt_compare_transcript_prompt = """Compare the following two online conversations and rate their similarity on a scale from 1 to 100, based on their trajectory. + +### **Definition of Trajectory** +The trajectory of a conversation refers to its **dynamics**, including: +- **Changes in tone** (e.g., neutral to argumentative, formal to casual, sarcastic or sincere). +- **Patterns of interaction** (e.g., back-and-forth exchanges, long monologues, interruptions). +- **Conversation strategies** (e.g., persuasion, questioning, storytelling). 
+- **Order of the above trajectory events** + +### **Ignore**: +- The topics discussed. +- Specific factual content. + +### **Output Requirements** +Return a JSON object containing: +- `"sim_score"` (int): A similarity score between 0-100, representing how similar the conversations are in **trajectory**. +- `"reason"` (string, ≤30 words): A brief explanation of why the score was given, referencing key conversational dynamics. + +### **Output Format (JSON)** +```json +{{ + "sim_score": , + "reason": "" +}} + +### **Conversations** +Conversation 1: +{pred} + +Conversation 2: +{ref} +""" + score, reason = self.get_gpt_compare_score( + transcript1, transcript2, naive_gpt_compare_transcript_prompt + ) + return score, reason diff --git a/convokit/convo_similarity/condyns.py b/convokit/convo_similarity/condyns.py new file mode 100644 index 00000000..b43289d1 --- /dev/null +++ b/convokit/convo_similarity/condyns.py @@ -0,0 +1,318 @@ +import ast +import numpy as np +import os +import re + +try: + from convokit.genai import get_llm_client + + GENAI_AVAILABLE = True +except ImportError: + GENAI_AVAILABLE = False + + +class ConDynS: + """A class to compute ConDynS score between conversations. + + ConDynS computes similarity scores between conversations by analyzing their + Summary of Conversation Dynamics (SCD) patterns, which are extracted from the SCD + as the Sequence of Patterns (SoP), and comparing them with conversation transcripts. + The method uses bidirectional similarity computation to capture the full dynamics + of both conversations. + + :param model_provider: The LLM provider to use (e.g., "gpt", "gemini") + :param config: The GenAIConfigManager instance to use + :param model: Optional specific model name + :param custom_condyns_prompt: Custom prompt for the condyns prompt template + :param custom_prompt_dir: Directory to save custom prompts (if not provided, overwrites default prompts in ./prompts) + """ + + CONDYNS_PROMPT_TEMPLATE = None + + @classmethod + def _load_prompts(cls): + """Lazy load prompts into class variables.""" + if cls.CONDYNS_PROMPT_TEMPLATE is None: + base_path = os.path.dirname(__file__) + with open( + os.path.join(base_path, "prompts/condyns_prompt.txt"), "r", encoding="utf-8" + ) as f: + cls.CONDYNS_PROMPT_TEMPLATE = f.read() + + def __init__( + self, + model_provider: str, + config, + model: str = None, + custom_condyns_prompt: str = None, + custom_prompt_dir: str = None, + ): + """Initialize the ConDynS score calculator with a specified model provider and optional model name. + + If no model is specified, defaults to our selected default model. + + :param model_provider: The LLM provider to use (e.g., "gpt", "gemini") + :param config: The GenAIConfigManager instance to use + :param model: Optional specific model name + :param custom_condyns_prompt: Custom prompt for the condyns prompt template + :param custom_prompt_dir: Directory to save custom prompts (if not provided, overwrites defaults in ./prompts) + :raises ImportError: If genai dependencies are not available + """ + if not GENAI_AVAILABLE: + raise ImportError( + "GenAI dependencies not available. Please install via `pip install convokit[genai]`." 
+ ) + + self.model_provider = model_provider + self.config = config + self.model = model + self.custom_prompt_dir = custom_prompt_dir + + # Load default prompts first + self._load_prompts() + + # Override with custom prompts if provided + if custom_condyns_prompt is not None: + self.CONDYNS_PROMPT_TEMPLATE = custom_condyns_prompt + if custom_prompt_dir: + self._save_custom_prompt("condyns_prompt.txt", custom_condyns_prompt) + else: + self._save_custom_prompt_to_default("condyns_prompt.txt", custom_condyns_prompt) + + if model is not None: + self.client = get_llm_client(model_provider, config, model=model) + else: + self.client = get_llm_client(model_provider, config) + + def _save_custom_prompt(self, filename: str, prompt_content: str): + """Save custom prompt to the specified directory. + + :param filename: Name of the file to save + :param prompt_content: Content of the prompt to save + """ + if self.custom_prompt_dir: + os.makedirs(self.custom_prompt_dir, exist_ok=True) + filepath = os.path.join(self.custom_prompt_dir, filename) + with open(filepath, "w", encoding="utf-8") as f: + f.write(prompt_content) + + def _save_custom_prompt_to_default(self, filename: str, prompt_content: str): + """Save custom prompt to the default prompts directory. + + :param filename: Name of the file to save + :param prompt_content: Content of the prompt to save + """ + base_path = os.path.dirname(__file__) + filepath = os.path.join(base_path, "prompts", filename) + with open(filepath, "w", encoding="utf-8") as f: + f.write(prompt_content) + + def set_custom_condyns_prompt(self, prompt_text: str, save_to_file: bool = True): + """Set a custom condyns prompt template. + + :param prompt_text: The custom prompt text + :param save_to_file: Whether to save the prompt to file in custom_prompt_dir or default prompts directory + """ + self.CONDYNS_PROMPT_TEMPLATE = prompt_text + if save_to_file: + if self.custom_prompt_dir: + self._save_custom_prompt("condyns_prompt.txt", prompt_text) + else: + self._save_custom_prompt_to_default("condyns_prompt.txt", prompt_text) + + def load_custom_prompts_from_directory(self, prompt_dir: str): + """Load custom prompts from a specified directory. + + :param prompt_dir: Directory containing custom prompt files + """ + condyns_path = os.path.join(prompt_dir, "condyns_prompt.txt") + + if os.path.exists(condyns_path): + with open(condyns_path, "r", encoding="utf-8") as f: + self.CONDYNS_PROMPT_TEMPLATE = f.read() + + def _clean_model_output_to_dict(self, text: str) -> dict: + """Clean and parse model output into a dictionary. + + Extracts dictionary content from model responses and handles common + formatting issues for safe parsing. + + :param text: Raw model output text + :return: Parsed dictionary from the model output + :raises ValueError: If no valid dictionary boundaries are found + """ + start = text.find("{") + end = text.rfind("}") + if start == -1 or end == -1 or end <= start: + raise ValueError("No valid dictionary boundaries found.") + + dict_str = text[start : end + 1] + dict_str = re.sub(r"'s\b", "s", dict_str) + dict_str = re.sub(r"'t\b", "t", dict_str) + dict_str = re.sub(r"'ve\b", "ve", dict_str) + return ast.literal_eval(dict_str) + + def get_condyns_score(self, transcript1, transcript2, sop1, sop2): + """Compute ConDynS score between two conversations. + + Computes ConDynS with the bidirectional similarity between two conversations using their + transcripts and SoPs, then returns the mean score. 
+ + :param transcript1: First conversation transcript + :param transcript2: Second conversation transcript + :param sop1: SoP for first conversation + :param sop2: SoP for second conversation + :return: ConDynS score + """ + condyns_score = self.compute_bidirectional_similarity(transcript1, transcript2, sop1, sop2) + return condyns_score, np.mean(self.compute_score_from_results(condyns_score)) + + def compute_unidirectional_similarity(self, sop1, transcript2): + """Compute unidirectional similarity between SoPs and a transcript. + + Analyzes how well the SoPs from one conversation match the dynamics + observed in another conversation's transcript. + + :param sop1: SoPs from the first conversation + :param transcript2: Conversation transcript from the second conversation + :return: Dictionary with analysis and scores for each event in sop1 + """ + # Format the prompt with the events and transcript + full_prompt = self.CONDYNS_PROMPT_TEMPLATE.format(events=sop1, transcript=transcript2) + + response = self.client.generate(full_prompt) + try: + response_dict = self._clean_model_output_to_dict(response.text) + except (SyntaxError, ValueError) as e: + print(response.text) + print("Error parsing output:", e) + raise Exception("error parsing") + return response_dict + + def compute_bidirectional_similarity(self, transcript1, transcript2, sop1, sop2): + """Compute bidirectional similarity between two conversations. + + Computes similarity in both directions: SoP1 vs Transcript2 and SoP2 vs Transcript1 + to capture the full dynamics of both conversations. + + :param transcript1: First conversation transcript + :param transcript2: Second conversation transcript + :param sop1: SoP for first conversation + :param sop2: SoP for second conversation + :return: List of [response_dict1, response_dict2] where each dict contains + analysis and scores for each event + """ + response_dict1 = self.compute_unidirectional_similarity(sop1, transcript2) + response_dict2 = self.compute_unidirectional_similarity(sop2, transcript1) + return [response_dict1, response_dict2] + + def measure_score(self, data): + """Calculate the mean score from a similarity result dictionary. + + :param data: Dictionary containing similarity analysis results + :return: Mean score across all events + """ + sum_score = [] + for item in data.values(): + sum_score.append(item["score"]) + return np.mean(sum_score) + + def compute_score_from_results(self, results): + """Compute scores from bidirectional similarity results. + + :param results: List of bidirectional similarity results + :return: List of mean scores for each direction + """ + scores = [] + for result in results: + scores.append(self.measure_score(result)) + return scores + + def _format_conversation_to_transcript(self, conversation): + """Format a ConvoKit conversation into a transcript string. + + Converts a conversation into a formatted transcript suitable for ConDynS analysis. + Uses chronological order and assigns speaker labels. + + :param conversation: ConvoKit Conversation object + :return: Formatted transcript string + """ + utt_list = conversation.get_chronological_utterance_list() + transcript_lines = [] + speaker_map = {} + speaker_counter = 1 + + for utt in utt_list: + # Assign speaker labels (SPEAKER1, SPEAKER2, etc.) 
+ if utt.speaker.id not in speaker_map: + speaker_map[utt.speaker.id] = f"SPEAKER{speaker_counter}" + speaker_counter += 1 + + speaker_label = speaker_map[utt.speaker.id] + transcript_lines.append(f"{speaker_label}: {utt.text}") + + return " ".join(transcript_lines) + + def compare_conversations( + self, corpus, convo_id1: str, convo_id2: str, sop_meta_name: str, formatter=None + ): + """Compare two conversations using ConDynS and store the result in both conversations' metadata. + + This method retrieves two conversations from the corpus, formats them into transcripts, + extracts their SoP data from metadata, computes the ConDynS score between them, and stores + the result in both conversations' metadata with the key format "condyns_{convo_id1}_{convo_id2}". + + :param corpus: The ConvoKit Corpus containing the conversations + :param convo_id1: ID of the first conversation + :param convo_id2: ID of the second conversation + :param sop_meta_name: Name of the metadata field containing SoP data + :param formatter: Optional custom formatter function that takes a Conversation object and returns a transcript string. + If None, uses the default formatter. + :return: The computed ConDynS score + :raises KeyError: If conversations don't exist or required metadata is missing + :raises ValueError: If SoP data is malformed + :raises TypeError: If custom formatter is not callable + """ + # Get conversations from corpus + try: + convo1 = corpus.get_conversation(convo_id1) + convo2 = corpus.get_conversation(convo_id2) + except KeyError as e: + raise KeyError(f"Conversation not found in corpus: {e}") + + # Validate custom formatter if provided + if formatter is not None and not callable(formatter): + raise TypeError("Custom formatter must be a callable function") + + # Format conversations into transcripts using custom or default formatter + if formatter is not None: + transcript1 = formatter(convo1) + transcript2 = formatter(convo2) + else: + transcript1 = self._format_conversation_to_transcript(convo1) + transcript2 = self._format_conversation_to_transcript(convo2) + + # Extract SoP data from metadata + try: + sop1 = convo1.meta[sop_meta_name] + sop2 = convo2.meta[sop_meta_name] + except KeyError as e: + raise KeyError(f"SoP metadata '{sop_meta_name}' not found in conversation: {e}") + + # Compute ConDynS score + result, condyns_score = self.get_condyns_score(transcript1, transcript2, sop1, sop2) + + # Store the score in both conversations' metadata + score_key1 = f"condyns_{convo_id1}_{convo_id2}" + score_key2 = f"condyns_{convo_id2}_{convo_id1}" + + result_key1 = f"condyns_result_{convo_id1}_{convo_id2}" + result_key2 = f"condyns_result_{convo_id2}_{convo_id1}" + + convo1.meta[result_key1] = result + convo2.meta[result_key2] = result + + convo1.meta[score_key1] = condyns_score + convo2.meta[score_key2] = condyns_score + + return result, condyns_score diff --git a/convokit/convo_similarity/examples/applications/applications.ipynb b/convokit/convo_similarity/examples/applications/applications.ipynb new file mode 100644 index 00000000..714e421d --- /dev/null +++ b/convokit/convo_similarity/examples/applications/applications.ipynb @@ -0,0 +1,2644 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4085d1b5", + "metadata": {}, + "source": [ + "# Applications with ConDynS\n", + "\n", + "This notebook demonstrates how to apply ConDynS to analyze conversational dynamics, replicating the application results discussed in Section 6 of our [paper: A Similarity Measure for Comparing Conversational 
Dynamics](https://arxiv.org/abs/2507.18956). It includes examples of clustering conversations, comparing inter- and intra-group similarity, and examining which speaker drives the conversation’s dynamics. The notebook serves as a quick reference for using the metric on other datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab9a225a-9b45-4043-bf69-3a8c24d2fd94", + "metadata": {}, + "outputs": [], + "source": [ + "from convokit import Corpus, download\n", + "from tqdm import tqdm\n", + "import matplotlib.pyplot as plt\n", + "plt.rcParams.update({'font.size': 13})\n", + "import nltk\n", + "nltk.download('punkt')\n", + "from nltk.tokenize import sent_tokenize\n", + "import numpy as np\n", + "import re, os, random, time, math, json, string\n", + "import scipy.stats as stats\n", + "from itertools import combinations\n", + "from datetime import datetime\n", + "from concurrent.futures import ThreadPoolExecutor, as_completed\n", + "from scipy.stats import mannwhitneyu\n", + "from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n", + "from scipy.spatial.distance import squareform\n", + "from sklearn.feature_extraction.text import CountVectorizer as CV\n", + "\n", + "from convokit.genai.genai_config import GenAIConfigManager\n", + "from convokit.convo_similarity.scd import SCD\n", + "from convokit.convo_similarity.condyns import ConDynS\n", + "\n", + "random.seed(4300)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b7b6f5f", + "metadata": {}, + "outputs": [], + "source": [ + "### Setup path for data and corpus ###\n", + "DATA_PATH = \"./data\"\n", + "\n", + "### Set up config for GenAI ###\n", + "config = GenAIConfigManager() ### make sure to set your own config if this is never set before\n", + "\n", + "### Select which model provider to use for ConDynS ###\n", + "MODEL_PROVIDER = \"gemini\"\n", + "MODEL = \"gemini-2.0-flash-001\"\n", + "config.set_google_cloud_config(\"YOUR PROJECT\", \"YOUR LOCATION\")" + ] + }, + { + "cell_type": "markdown", + "id": "260a1ff3", + "metadata": {}, + "source": [ + "### Experiment Setup\n", + "\n", + "We first focus on conversations from the ChangeMyView sub-Reddit. We need to annotate delta information for conversations to prepare for conversation selections later. We would also define methods to filter out invalid conversations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04756fd5", + "metadata": {}, + "outputs": [], + "source": [ + "corpus = Corpus(filename=download(\"subreddit-changemyview\", data_dir=DATA_PATH))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ede2639-1f73-4f9f-802a-44424f247e5c", + "metadata": {}, + "outputs": [], + "source": [ + "### Annotate Corpus with Delta Information Based on DeltaBot ###\n", + "for utt in tqdm(corpus.iter_utterances()):\n", + " if (\n", + " utt.reply_to is not None\n", + " and utt.speaker.id == \"DeltaBot\"\n", + " and \"delta awarded\" in utt.text\n", + " ):\n", + " deltabot_text = utt.text\n", + " match = re.search(\n", + " r\"(?:Confirmed: 1 delta awarded to )(?:\\/)?(?:u\\/)([\\w-]+)\", deltabot_text\n", + " )\n", + " if match is not None:\n", + " try:\n", + " delta_utt = corpus.get_utterance(utt.reply_to)\n", + " delta_utt.meta['got_delta'] = True\n", + " except KeyError:\n", + " continue" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fff0a9c6-8c4b-4409-b0bf-c042cb33c9d2", + "metadata": {}, + "outputs": [], + "source": [ + "### Remove Speakers that has delta / # convo ratio > 1 or had more than 10000 conversations (likely bots) ###\n", + "invalid_speakers = ['Cou', 'rightplacewr0ngtime', 'ThursdayTrashTyrant', 'UggoJesus', 'WideLight', 'urge_to_merge', 'LukeJovanovic'] + ['hacksoncode', 'Ansuz07', 'tbdabbholm'] \n", + "### Remove Convos that contain toxic content which prevents model from processing ###\n", + "toxic_convos = ['unbvfc_d', 'cvs2dx_c', 'brshwd_d', 'vhmoje_b', 'qdd7k1_b', '3mhbgl_d', 'xr0bxq_b']\n", + "\n", + "### Controlling for the length of the conversation ###\n", + "# def is_correct_length_convo(utt_lst, low_lim=5, high_lim=7):\n", + "# num_of_utt = len(utt_lst)\n", + "# return num_of_utt >= low_lim and num_of_utt <= high_lim\n", + "def is_correct_length_convo(utt_lst, low_lim=5):\n", + " return len(utt_lst) >= low_lim\n", + "\n", + "### Controlling for the number of speakers ###\n", + "def is_two_speaker_convo(utt_lst):\n", + " speaker_lst = []\n", + " for utt in utt_lst:\n", + " if utt.speaker.id not in speaker_lst:\n", + " speaker_lst.append(utt.speaker.id)\n", + " if len(speaker_lst) > 2:\n", + " return False\n", + " return len(speaker_lst) == 2\n", + "\n", + "### Filtering out invalid convos ###\n", + "def is_valid_convo(convo, utt_lst, invalid_sp=invalid_speakers, toxic_convo=toxic_convos):\n", + " if convo.id in toxic_convo or not is_correct_length_convo(utt_lst) or not is_two_speaker_convo(utt_lst):\n", + " return False\n", + "\n", + " for utt in utt_lst:\n", + " if utt.speaker.id in invalid_sp:\n", + " return False\n", + " for utt in utt_lst:\n", + " if utt.text == \"[deleted]\":\n", + " return False\n", + " return True\n", + "\n", + "### Helper functions ###\n", + "def get_all_speakers(utt_lst):\n", + " speaker_lst = []\n", + " for utt in utt_lst:\n", + " speaker_lst.append(utt.speaker.id)\n", + " return speaker_lst\n", + "\n", + "def get_convo_year(utt_lst):\n", + " timestamp = utt_lst[1].meta['retrieved_on']\n", + " time = datetime.utcfromtimestamp(timestamp)\n", + " if time.year > 2014:\n", + " return time.year\n", + " else:\n", + " for utt in utt_lst:\n", + " timestamp = utt.meta['retrieved_on']\n", + " time = datetime.utcfromtimestamp(timestamp)\n", + " if time.year > 2014:\n", + " return time.year\n", + " return None\n", + "\n", + "def read_convo(utt_lst):\n", + " for utt in utt_lst:\n", + " print(f\"{utt.speaker.id} : {utt.text}\\n\")" + ] + }, + { + 
"cell_type": "markdown", + "id": "51ead648-69c4-4db7-829a-ab3c06498e4b", + "metadata": {}, + "source": [ + "### Select Conversations From Year of 2018\n", + "\n", + "Our experiments are conducted on Reddit Data from 2018 so there is no LLM influence on contents. Now we filter out valid conversations from 2018, and select conversations from there." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c8e7e21-f83b-4aa4-994c-1502e94247c1", + "metadata": {}, + "outputs": [], + "source": [ + "### Filtering out invalid conversations ###\n", + "random.seed(4300)\n", + "valid_convos = {}\n", + "for convo in tqdm(corpus.iter_conversations()):\n", + " try:\n", + " all_convos = convo.get_root_to_leaf_paths()\n", + " except ValueError:\n", + " continue\n", + " cur_convo_valid_utts = []\n", + " for utt_lst in all_convos:\n", + " if is_valid_convo(convo, utt_lst):\n", + " cur_convo_valid_utts.append(utt_lst)\n", + " if cur_convo_valid_utts:\n", + " cur_convo_utt_lst = random.choice(cur_convo_valid_utts)\n", + " valid_convos[convo.id] = cur_convo_utt_lst\n", + "\n", + "print(\"Number of valid convos:\", len(valid_convos))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6dc35b9e-b4cb-49cd-b994-a16766477590", + "metadata": {}, + "outputs": [], + "source": [ + "### Get year 2018 conversations ###\n", + "convo_to_year = {'2018' : []}\n", + "for convo_id, utt_lst in tqdm(valid_convos.items()):\n", + " convo_year = get_convo_year(utt_lst)\n", + " for year, _ in convo_to_year.items():\n", + " if convo_year == int(year):\n", + " convo_to_year[year].append(convo_id)\n", + " break\n", + "\n", + "for year, convos in convo_to_year.items():\n", + " print(f\"{year}: {len(convos)} conversations\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea4e4e06-f4d9-4926-89b0-c0af2d596de8", + "metadata": {}, + "outputs": [], + "source": [ + "### Select conversations from 2018 We always filter a little more to deal with the case when LLM can't handle the input ###\n", + "random.seed(4300)\n", + "K = 220\n", + "used_speakers = []\n", + "selected_convos_each_year = {'2018' : []}\n", + "for year, convo_ids in tqdm(convo_to_year.items()):\n", + " random.shuffle(convo_ids)\n", + " for convo_id in convo_ids:\n", + " convo_speakers = get_all_speakers(valid_convos[convo_id])\n", + " \n", + " if all(s not in used_speakers for s in convo_speakers):\n", + " selected_convos_each_year[year].append(convo_id)\n", + " used_speakers.extend(convo_speakers)\n", + " \n", + " if len(selected_convos_each_year[year]) == K:\n", + " break\n", + "\n", + "for year, convos in selected_convos_each_year.items():\n", + " print(f\"{year}: {len(convos)} conversations\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad9be6ca-fccc-472d-8896-a4d1a6d2619f", + "metadata": {}, + "outputs": [], + "source": [ + "### Constructing the corpus for selected conversations ###\n", + "all_utterances = []\n", + "for convo_id in selected_convos_each_year['2018']:\n", + " utt_lst = [utt for utt in corpus.get_conversation(convo_id).iter_utterances()]\n", + " all_utterances.extend(utt_lst)\n", + "\n", + "selected_corpus = Corpus(utterances=all_utterances)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd2bab74-a942-4d8d-9bbb-f1735b62eea0", + "metadata": {}, + "outputs": [], + "source": [ + "### Annotating the new corpus with metadata information ###\n", + "for convo in tqdm(selected_corpus.iter_conversations()):\n", + " delta_convo = False\n", + " for utt in 
convo.iter_utterances():\n", + " if 'got_delta' in utt.meta and utt.meta['got_delta']:\n", + " delta_convo = True\n", + " og_convo = corpus.get_conversation(convo.id)\n", + " convo.meta = og_convo.meta\n", + " convo.meta['year'] = '2018'\n", + " convo.meta['has_delta'] = delta_convo\n", + "\n", + "for sp in tqdm(selected_corpus.iter_speakers()):\n", + " og_sp = corpus.get_speaker(sp.id)\n", + " sp.meta = og_sp.meta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbba2710-4f2c-4b4a-9914-b24c4d5a0675", + "metadata": {}, + "outputs": [], + "source": [ + "selected_corpus.dump(\"cmv_selected_convos_2018\", base_path=DATA_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b643d61a-0460-4da0-a48f-ce8962c1c676", + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_PATH + \"cmv_selected_convo_ids_2018.json\", \"w\") as f:\n", + " json.dump(selected_convos_each_year['2018'], f, indent=4)" + ] + }, + { + "cell_type": "markdown", + "id": "05ef74a7-e042-4082-9055-750532817507", + "metadata": {}, + "source": [ + "## ConDynS Computation - Random 2018 Set\n", + "\n", + "In this section, we compute ConDynS scores for a randomly selected set of 2018 conversations. These scores form the basis for the subsequent analyses, including clustering and group-level comparisons." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d05e2c10-9f0f-4748-b65f-14682f30724d", + "metadata": {}, + "outputs": [], + "source": [ + "BASE_PATH = \"./artefacts\"\n", + "CUR_ANALYSIS = \"random_set_2018\"\n", + "\n", + "if not os.path.exists(f\"{BASE_PATH}\"):\n", + " print(\"Making directory: \", f\"{BASE_PATH}\")\n", + " os.makedirs(f\"{BASE_PATH}\")\n", + "\n", + "if not os.path.exists(f\"{BASE_PATH}/{CUR_ANALYSIS}/\"):\n", + " print(\"Making directory: \", f\"{BASE_PATH}/{CUR_ANALYSIS}/\")\n", + " os.makedirs(f\"{BASE_PATH}/{CUR_ANALYSIS}/\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a5f1a75-d644-4883-89cd-df6ec889a63f", + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_PATH + \"cmv_selected_convo_ids_2018.json\", \"r\") as f:\n", + " convo_2018 = json.load(f)\n", + "\n", + "corpus = Corpus(filename=DATA_PATH + \"cmv_selected_convos_2018\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56e24ddb-1244-41c9-b190-394c561f7337", + "metadata": {}, + "outputs": [], + "source": [ + "def get_cur_convo_transcript(corpus, convo_id):\n", + " convo = corpus.get_conversation(convo_id)\n", + " utt_list = convo.get_chronological_utterance_list()\n", + " transcription = []\n", + " spk_list = {utt_list[0].speaker.id : \"SPEAKER1\"}\n", + " for utt in utt_list:\n", + " if utt.speaker.id not in spk_list.keys():\n", + " spk_list[utt.speaker.id] = \"SPEAKER2\"\n", + " assert len(spk_list) == 2\n", + " transcription.append(spk_list[utt.speaker.id] +\": \"+utt.text)\n", + " transcription = transcription[1:] # remove first OP message\n", + " return transcription" + ] + }, + { + "cell_type": "markdown", + "id": "7f66277f", + "metadata": {}, + "source": [ + "Here, we initialize our modules for writing the SCDs and computing ConDynS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "623734f8", + "metadata": {}, + "outputs": [], + "source": [ + "### Initialize the modules for ConDynS ###\n", + "scd_transformer = SCD(model_provider=MODEL_PROVIDER, model=MODEL, config=config)\n", + "condyns = ConDynS(model_provider=MODEL_PROVIDER, model=MODEL, config=config)" + ] + }, + { + 
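Before scaling up to all pairs, the sketch below shows a single end-to-end comparison. It is illustrative: it simply takes the first two selected conversation IDs and relies on the SCD transformer writing the `machine_scd`/`machine_sop` metadata that the next cell also uses.

```python
# Single-pair sketch: generate SCD/SoP metadata for two conversations, then compute
# ConDynS between them. The helper in the next cell wraps these same calls with
# caching, retries, and a thread pool.
convo_id1, convo_id2 = convo_2018[0], convo_2018[1]

scd_transformer.transform(corpus, selector=lambda convo: convo.id in (convo_id1, convo_id2))

result, score = condyns.compare_conversations(
    corpus, convo_id1, convo_id2, sop_meta_name="machine_sop"
)
print(score)   # mean over the two directional (per-event-averaged) scores
print(corpus.get_conversation(convo_id1).meta[f"condyns_{convo_id1}_{convo_id2}"])
```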
"cell_type": "code", + "execution_count": null, + "id": "8f5966d3-330d-4d2a-921c-59b8a3a9c4dd", + "metadata": {}, + "outputs": [], + "source": [ + "def compute_within_group_similarity(all_convos, summaries_and_bullets, similarity_result, incomplete, data_path):\n", + " \"\"\"\n", + " Compute the similarity between all conversations in the same group of conversations using ConDynS.\n", + " This function is build with parallelization in mind to speed up the computation.\n", + " \"\"\"\n", + " all_convos_combos = list(combinations(all_convos, 2))\n", + " len(all_convos_combos)\n", + " \n", + " # ### Calling GPT to create the SCD and bullet points. Run with caution.\n", + " summaries_and_bullets = summaries_and_bullets\n", + " incomplete = incomplete\n", + " similarity_result = similarity_result\n", + " \n", + " # Generate SCDs and SoPs for conversations that don't have them yet\n", + " convos_to_process = [convo_id for convo_id in all_convos if convo_id not in summaries_and_bullets.keys()]\n", + " \n", + " if convos_to_process:\n", + " def convo_selector(conversation):\n", + " return conversation.id in convos_to_process\n", + " \n", + " print(f\"Generating SCDs for {len(convos_to_process)} conversations...\")\n", + " scd_transformer.transform(corpus, selector=convo_selector)\n", + " \n", + " # Extract results from conversation metadata\n", + " for convo_id in convos_to_process:\n", + " convo = corpus.get_conversation(convo_id)\n", + " summary = convo.meta.get(\"machine_scd\", \"\")\n", + " bulletpoint = convo.meta.get(\"machine_sop\", \"\")\n", + " summaries_and_bullets.update({convo_id : {\"summary\" : summary, \"bulletpoint\" : bulletpoint}})\n", + " with open(f\"{data_path}summary.json\", \"w\") as file:\n", + " json.dump(summaries_and_bullets, file, indent=4)\n", + "\n", + " def get_bidirection_similarity_with_retry(corpus, convo1_id, convo2_id, summaries_and_bullets, retries=10):\n", + " for i in range(retries):\n", + " try:\n", + " result, score = condyns.compare_conversations(\n", + " corpus, convo1_id, convo2_id, \n", + " sop_meta_name=\"machine_sop\"\n", + " )\n", + " return score, result\n", + " except Exception as e:\n", + " wait = 0.5 ** i + random.random()\n", + " print(f\"Retrying ({convo1_id}, {convo2_id}) after {wait:.2f}s due to error: {e}\")\n", + " incomplete.append(f'{convo1_id}_{convo2_id}')\n", + " time.sleep(wait)\n", + " return None, None\n", + " \n", + " # The thread worker function\n", + " def worker(corpus, convo1_id, convo2_id, summaries_and_bullets):\n", + " score, result = get_bidirection_similarity_with_retry(corpus, convo1_id, convo2_id, summaries_and_bullets)\n", + " return (f'{convo1_id}_{convo2_id}', {\"score\": score, \"result\": result})\n", + "\n", + " MAX_WORKER = 50\n", + " \n", + " # Build task list\n", + " tasks = [(corpus, id1, id2, summaries_and_bullets) \n", + " for id1, id2 in all_convos_combos \n", + " if f'{id1}_{id2}' not in similarity_result and f'{id1}_{id2}' not in incomplete and f'{id2}_{id1}' not in similarity_result and f'{id2}_{id1}' not in incomplete]\n", + " \n", + " # Set up thread pool\n", + " with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:\n", + " futures = [executor.submit(worker, *task) for task in tasks]\n", + " \n", + " for future in tqdm(as_completed(futures), total=len(futures), desc=\"Computing Pair-wise Similarity\"):\n", + " key, value = future.result()\n", + " if key in similarity_result.keys():\n", + " print(\"not good, repeated keys\")\n", + " if key not in incomplete:\n", + " similarity_result[key] = 
value\n", + "\n", + " with open(f\"{data_path}similarity.json\", \"w\") as file:\n", + " json.dump(similarity_result, file, indent=4)\n", + "\n", + " with open(f\"{data_path}incomplete.json\", \"w\") as file:\n", + " json.dump(incomplete, file, indent=4)\n", + "\n", + " return summaries_and_bullets, similarity_result, incomplete" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c95bcc5-764d-4f87-a4e7-9455009e7d0f", + "metadata": {}, + "outputs": [], + "source": [ + "### Compute within-group similarity using ConDynS on selected 2018 conversations ###\n", + "all_convos = convo_2018\n", + "data_path = f\"{BASE_PATH}/{CUR_ANALYSIS}/\"\n", + "\n", + "if os.path.exists(f\"{data_path}summary.json\"):\n", + " with open(f\"{data_path}summary.json\", \"r\") as file:\n", + " summaries_and_bullets = json.load(file)\n", + "else:\n", + " summaries_and_bullets = {}\n", + "\n", + "if os.path.exists(f\"{data_path}similarity.json\"):\n", + " with open(f\"{data_path}similarity.json\", \"r\") as file:\n", + " similarity_result = json.load(file)\n", + "else:\n", + " similarity_result = {}\n", + "\n", + "incomplete = []\n", + "\n", + "summaries_and_bullets, similarity_result, incomplete = compute_within_group_similarity(all_convos, summaries_and_bullets, similarity_result, incomplete, data_path)\n", + "\n", + "print(\"Complete!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c739ecd4-daa3-4564-b8f0-27a8f27acf6a", + "metadata": {}, + "outputs": [], + "source": [ + "### Extract problematic conversations from the computation, expected to be [] ###\n", + "def extract_ids(id_list):\n", + " id_count = {}\n", + " for item in id_list:\n", + " parts = item.split('_')\n", + " mid = len(parts) // 2\n", + " id1 = '_'.join(parts[:mid])\n", + " id2 = '_'.join(parts[mid:])\n", + "\n", + " for id in [id1, id2]:\n", + " if id not in id_count.keys():\n", + " id_count[id] = 0\n", + " id_count[id] += 1\n", + " return id_count\n", + "problem_convos = [id for id, count in extract_ids(incomplete).items()]\n", + "problem_convos" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de468801-1fa8-4e14-84ff-f47ac177861a", + "metadata": {}, + "outputs": [], + "source": [ + "after_incomplete = [idx for idx in convo_2018 if idx not in problem_convos]\n", + "random.seed(4300)\n", + "convo_2018 = random.sample(after_incomplete, 200)\n", + "print(len(convo_2018))" + ] + }, + { + "cell_type": "markdown", + "id": "d246a73d-498a-48aa-8301-db479b26d93e", + "metadata": {}, + "source": [ + "## Analysis - Clustering\n", + "\n", + "We use the computed ConDynS scores to cluster the conversation set into two groups and analyze their distinguishing interaction patterns (via fighting words) and the distribution of persuasion outcomes (Δ awards) across clusters." 
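The cells below implement this with the full `cluster_conversations` helper plus fighting-words utilities. As a distilled sketch of the core step (assuming the `get_similarity` helper and the `similarity_result` dict loaded in the surrounding cells), pairwise ConDynS similarities are converted to distances and cut with Ward-linkage hierarchical clustering at the same threshold of 2.5 used later:

```python
# Distilled version of cluster_conversations() defined below: similarity -> distance,
# Ward linkage, flat clusters at the notebook's threshold of 2.5.
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

n = len(convo_2018)
dist = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        sim = np.mean(get_similarity(convo_2018[i], convo_2018[j], convo_2018, similarity_result))
        dist[i, j] = dist[j, i] = 1 - sim

labels = fcluster(linkage(squareform(dist), method="ward"), t=2.5, criterion="distance")
clusters = {}
for convo_id, label in zip(convo_2018, labels):
    clusters.setdefault(label, []).append(convo_id)
```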
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "177031de-2d2c-4312-88b6-a37588a8a7a0", + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_PATH + \"cmv_selected_convo_ids_2018.json\", \"r\") as f:\n", + " convo_2018 = json.load(f)\n", + "\n", + "all_convos = convo_2018" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "747e3e17-a842-4ecd-9e1e-c5abcf28b773", + "metadata": {}, + "outputs": [], + "source": [ + "### Verify all similarity are computed valid here ###\n", + "with open(f\"{data_path}summary.json\", \"r\") as file:\n", + " summaries_and_bullets = json.load(file)\n", + "for convo_id in all_convos:\n", + " assert convo_id in summaries_and_bullets\n", + "\n", + "with open(f\"{data_path}similarity.json\", \"r\") as file:\n", + " similarity_result = json.load(file)\n", + "all_convos_combos = list(combinations(convo_2018, 2))\n", + "for id1, id2 in all_convos_combos:\n", + " assert f'{id1}_{id2}' in similarity_result or f'{id2}_{id1}' in similarity_result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "651573c1-c489-420d-a26b-629af14840bb", + "metadata": {}, + "outputs": [], + "source": [ + "### Helper functions to read in the results ###\n", + "def get_similarity(convo1, convo2, all_convos, sim_score):\n", + " if convo1 not in all_convos or convo2 not in all_convos:\n", + " raise Exception(\"convo not in selected convo\")\n", + " key = f\"{convo1}_{convo2}\" if f\"{convo1}_{convo2}\" in sim_score.keys() else f\"{convo2}_{convo1}\"\n", + " score = sim_score[key]['score']\n", + " return score\n", + "\n", + "def compute_intra_group_similarity(group, sim_scores):\n", + " similarities = []\n", + " for i in range(len(group)):\n", + " for j in range(i + 1, len(group)):\n", + " sim = np.mean(get_similarity(group[i], group[j], group, sim_scores))\n", + " similarities.append(sim)\n", + " return np.array(similarities)\n", + "\n", + "def two_groups_intra_group_similarity_check(group1, group2, group1_name, group2_name, all_convos, sim_score, y_lim=4):\n", + " group1_similarities = compute_intra_group_similarity(group1, sim_score)\n", + " group2_similarities = compute_intra_group_similarity(group2, sim_score)\n", + " \n", + " group1_mean = np.mean(group1_similarities)\n", + " group2_mean = np.mean(group2_similarities)\n", + "\n", + " plt.figure(figsize=(6, 4))\n", + " plt.hist(group1_similarities, bins=20, alpha=0.5, color='blue', label=f'{group1_name} convos', density=False)\n", + " plt.hist(group2_similarities, bins=20, alpha=0.5, color='red', label=f'{group2_name} convos', density=False)\n", + " \n", + " plt.xlabel(\"similarity score\")\n", + " plt.ylabel(\"number of conversation pairs\")\n", + " # plt.title(f\"Distribution of Similarity Scores for {group1_name} and {group2_name}\")\n", + " plt.legend()\n", + " plt.legend(loc='upper left')\n", + " plt.grid(True, linestyle='--', alpha=0.6)\n", + " plt.xlim(0, 1)\n", + " plt.ylim(0, y_lim)\n", + " # plt.savefig(\"within-group-sim.png\") ### Save figure for submissions\n", + " \n", + " plt.show()\n", + " \n", + " # Compare distributions with statistical tests\n", + " t_stat, p_value = stats.ttest_ind(group1_similarities, group2_similarities, equal_var=False)\n", + " levene_stat, levene_p = stats.levene(group1_similarities, group2_similarities)\n", + " statistic, p_value = mannwhitneyu(group1_similarities, group2_similarities, alternative='two-sided')\n", + "\n", + " print(f\"Mann-Whitney U statistic = {statistic}, p-value = {p_value:.5f}\")\n", + " \n", + " # Print 
results\n", + " print(f\"{group1_name} Mean Similarity: {group1_mean:.4f}\")\n", + " print(f\"{group2_name} Mean Similarity: {group2_mean:.4f}\")\n", + " print(f\"T-Test p-value (Are means different?): {p_value:.4f}\")\n", + " print(f\"Levene's Test p-value (Is variance different?): {levene_p:.4f}\")\n", + "\n", + "def compute_between_group_similarity(group1, group2, all_convos, sim_score):\n", + " similarities = []\n", + " for convo1 in group1:\n", + " for convo2 in group2:\n", + " sim = np.mean(get_similarity(convo1, convo2, all_convos, sim_score))\n", + " similarities.append(sim)\n", + " return np.array(similarities)\n", + " \n", + "def between_group_similarity_check(group1, group2, group1_name, group2_name, all_convos, sim_score, y_lim=4):\n", + " between_similarities = compute_between_group_similarity(group1, group2, all_convos, sim_score)\n", + " between_mean = np.mean(between_similarities)\n", + " plt.figure(figsize=(6, 4))\n", + " plt.hist(between_similarities, bins=20, alpha=0.7, color='purple', label=f'{group1_name} vs {group2_name}', density=False)\n", + " plt.xlabel(\"similarity score\")\n", + " plt.ylabel(\"number of conversation pairs\")\n", + " plt.title(f\"Between-Group Similarity: {group1_name} vs {group2_name}\")\n", + " plt.legend()\n", + " plt.grid(True, linestyle='--', alpha=0.6)\n", + " plt.xlim(0, 1)\n", + " plt.ylim(0, y_lim)\n", + " plt.show()\n", + " print(f\"Between-Group Mean Similarity ({group1_name} vs {group2_name}): {between_mean:.4f}\")\n", + "\n", + "def compute_intra_group_similarity_plot(group, group_name, sim_scores, y_lim=2000):\n", + " similarities = []\n", + " for i in range(len(group)):\n", + " for j in range(i + 1, len(group)):\n", + " sim = np.mean(get_similarity(group[i], group[j], group, sim_scores))\n", + " similarities.append(sim)\n", + "\n", + " plt.figure(figsize=(6, 4))\n", + " plt.hist(np.array(similarities), bins=20, alpha=0.5, color='red', label=f'{group_name}', density=False)\n", + " plt.xlabel(\"similarity score\")\n", + " plt.ylabel(\"number of conversation pairs\")\n", + " # plt.title(f\"Distribution of Similarity Scores for {group1_name} and {group2_name}\")\n", + " plt.legend()\n", + " plt.legend(loc='upper left')\n", + " plt.grid(True, linestyle='--', alpha=0.6)\n", + " plt.xlim(0, 1)\n", + " plt.ylim(0, y_lim)\n", + " # plt.savefig(\"within-group-sim-random-set.png\") ### Save figure for submissions\n", + " print(np.mean(np.array(similarities)))\n", + " plt.show()\n", + " mean_val = np.mean(similarities)\n", + " median_val = np.median(similarities)\n", + " percentile_25 = np.percentile(similarities, 25)\n", + " percentile_75 = np.percentile(similarities, 75)\n", + " print(f\"{group_name} Similarity Stats:\")\n", + " print(f\"Mean: {mean_val:.4f}\")\n", + " print(f\"Median: {median_val:.4f}\")\n", + " print(f\"25th pct: {percentile_25:.4f}\")\n", + " print(f\"75th pct: {percentile_75:.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50cea4f7-a94e-42e4-b697-02afa0843a7a", + "metadata": {}, + "outputs": [], + "source": [ + "compute_intra_group_similarity_plot(convo_2018, \"random set\", similarity_result, y_lim=2500)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "073e74e7-14c8-401f-90d1-7dfede6b9cc8", + "metadata": {}, + "outputs": [], + "source": [ + "### Helper functions for clustering conversations ###\n", + "from matplotlib import pyplot as plt\n", + "\n", + "def cluster_conversations(convo_ids, threshold=5):\n", + " \"\"\"\n", + " Performs hierarchical clustering on a 
list of conversation IDs based on pairwise similarity.\n", + " Parameters:\n", + " - convo_ids (list): List of conversation identifiers.\n", + " Returns:\n", + " - linkage_matrix (ndarray): Linkage matrix from hierarchical clustering.\n", + " \"\"\"\n", + " n = len(convo_ids)\n", + " distance_matrix = np.zeros((n, n))\n", + "\n", + " for i in range(n):\n", + " for j in range(i + 1, n):\n", + " convo1, convo2 = convo_ids[i], convo_ids[j]\n", + " try:\n", + " similarity = np.mean(get_similarity(convo1, convo2, convo_2018, similarity_result))\n", + " distance = 1 - similarity\n", + " distance_matrix[i, j] = distance_matrix[j, i] = distance\n", + " except Exception as e:\n", + " # distance_matrix[i, j] = distance_matrix[j, i] = 1 # Max distance if error\n", + " print(e)\n", + "\n", + " condensed_dist_matrix = squareform(distance_matrix)\n", + " linkage_matrix = linkage(condensed_dist_matrix, method=\"ward\")\n", + "\n", + " cluster_labels = fcluster(linkage_matrix, threshold, criterion='distance')\n", + "\n", + " clusters = {}\n", + " for convo, label in zip(convo_ids, cluster_labels):\n", + " if label not in clusters:\n", + " clusters[label] = []\n", + " clusters[label].append(convo)\n", + " \n", + " cluster_lists = list(clusters.values())\n", + "\n", + " plt.figure(figsize=(12, 6))\n", + " dendrogram(linkage_matrix, labels=convo_ids, leaf_rotation=90)\n", + " plt.xlabel(\"Conversation ID\")\n", + " plt.ylabel(\"Distance\")\n", + " plt.show()\n", + "\n", + " # for i, cluster in enumerate(cluster_lists):\n", + " # print(f\"Cluster {i+1}: {cluster}\")\n", + " \n", + " return linkage_matrix, cluster_lists\n", + "\n", + "exclude = set(string.punctuation)\n", + "\n", + "# from https://github.com/jmhessel/FightingWords/blob/master/fighting_words_py3.py\n", + "def basic_sanitize(in_string):\n", + " '''Returns a very roughly sanitized version of the input string.'''\n", + " in_string = ''.join([ch for ch in in_string if ch not in exclude])\n", + " in_string = in_string.lower()\n", + " in_string = ' '.join(in_string.split())\n", + " return in_string\n", + "\n", + "def bayes_compare_language(l1, l2, ngram = 1, prior=.01, cv = None):\n", + " '''\n", + " Arguments:\n", + " - l1, l2; a list of strings from each language sample\n", + " - ngram; an int describing up to what n gram you want to consider (1 is unigrams,\n", + " 2 is bigrams + unigrams, etc). Ignored if a custom CountVectorizer is passed.\n", + " - prior; either a float describing a uniform prior, or a vector describing a prior\n", + " over vocabulary items. 
If you're using a predefined vocabulary, make sure to specify that\n", + " when you make your CountVectorizer object.\n", + " - cv; a sklearn.feature_extraction.text.CountVectorizer object, if desired.\n", + "\n", + " Returns:\n", + " - A list of length |Vocab| where each entry is a (n-gram, zscore) tuple.'''\n", + " if cv is None and type(prior) is not float:\n", + " print(\"If using a non-uniform prior:\")\n", + " print(\"Please also pass a count vectorizer with the vocabulary parameter set.\")\n", + " quit()\n", + " l1 = [basic_sanitize(l) for l in l1]\n", + " l2 = [basic_sanitize(l) for l in l2]\n", + " if cv is None:\n", + " cv = CV(decode_error = 'ignore', min_df=2, max_df=0.9, ngram_range=(1,ngram),\n", + " binary = False,\n", + " max_features = 15000)\n", + " counts_mat = cv.fit_transform(l1+l2).toarray()\n", + " # Now sum over languages...\n", + " vocab_size = len(cv.vocabulary_)\n", + " print(\"Vocab size is {}\".format(vocab_size))\n", + " if type(prior) is float:\n", + " priors = np.array([prior for i in range(vocab_size)])\n", + " else:\n", + " priors = prior\n", + " z_scores = np.empty(priors.shape[0])\n", + " count_matrix = np.empty([2, vocab_size], dtype=np.float32)\n", + " count_matrix[0, :] = np.sum(counts_mat[:len(l1), :], axis = 0)\n", + " count_matrix[1, :] = np.sum(counts_mat[len(l1):, :], axis = 0)\n", + " a0 = np.sum(priors)\n", + " n1 = 1.*np.sum(count_matrix[0,:])\n", + " n2 = 1.*np.sum(count_matrix[1,:])\n", + " print(\"Comparing language...\")\n", + " for i in range(vocab_size):\n", + " #compute delta\n", + " term1 = np.log((count_matrix[0,i] + priors[i])/(n1 + a0 - count_matrix[0,i] - priors[i]))\n", + " term2 = np.log((count_matrix[1,i] + priors[i])/(n2 + a0 - count_matrix[1,i] - priors[i]))\n", + " delta = term1 - term2\n", + " #compute variance on delta\n", + " var = 1./(count_matrix[0,i] + priors[i]) + 1./(count_matrix[1,i] + priors[i])\n", + " #store final score\n", + " z_scores[i] = delta/np.sqrt(var)\n", + " index_to_term = {v:k for k,v in cv.vocabulary_.items()}\n", + " sorted_indices = np.argsort(z_scores)\n", + " return_list = []\n", + " for i in sorted_indices:\n", + " return_list.append((index_to_term[i], z_scores[i]))\n", + " return return_list\n", + "\n", + "def format_bullet_points(bullets):\n", + " text = \"\"\n", + " for idx, bullet in bullets.items():\n", + " text += f\"{idx} : {bullet}\\n\"\n", + " return text\n", + "\n", + "def get_machine_scd_and_bullets(convo_id, scd_dict=summaries_and_bullets):\n", + " summary = scd_dict[convo_id]['summary']\n", + " bullet = format_bullet_points(scd_dict[convo_id]['bulletpoint'])\n", + " return summary, bullet\n", + "\n", + "def get_fighting_words_convincers(cluster1, cluster2):\n", + " for lst in [cluster1, cluster2]:\n", + " for convo_id in lst:\n", + " summary, bullets = get_machine_scd_and_bullets(convo_id)\n", + " convo = corpus.get_conversation(convo_id)\n", + " convo.meta['new_machine_scd'] = summary\n", + " convo.meta['machine_bullet_points'] = bullets\n", + " sentences = sent_tokenize(summary)\n", + " convincer_sentences = [s for s in sentences if 'CONVINCER' in s]\n", + " convo.meta['new_convincer_text_machine_scd'] = \" \".join(convincer_sentences)\n", + " convo.meta['dict_bullet_points'] = dict(line.split(' : ', 1) for line in bullets.strip().split('\\n'))\n", + " convincer_bullets = \", \".join([s for _, s in convo.meta['dict_bullet_points'].items() if 'convincer' in s.lower()])\n", + " convo.meta['convincer_bullet_points'] = convincer_bullets\n", + " \n", + " cluster1_convincer_scds = []\n", 
+ " cluster2_convincer_scds = []\n", + " for convo_id in cluster1:\n", + " convo = corpus.get_conversation(convo_id)\n", + " cluster1_convincer_scds.append(convo.meta['convincer_bullet_points'])\n", + " for convo_id in cluster2:\n", + " convo = corpus.get_conversation(convo_id)\n", + " cluster2_convincer_scds.append(convo.meta['convincer_bullet_points'])\n", + " \n", + " z_scores = bayes_compare_language(cluster1_convincer_scds, cluster2_convincer_scds, ngram = 3) \n", + " top_k = 10\n", + " top_k_class1 = list(reversed([(x[0], round(x[1],2)) for x in z_scores[-top_k:]]))\n", + " top_k_class2 = [(x[0], round(x[1],2)) for x in z_scores[:top_k]]\n", + " top_k_class1 = list(reversed([(x[0], round(x[1],2)) for x in z_scores[-top_k:]]))\n", + " top_k_class2 = [(x[0], round(x[1],2)) for x in z_scores[:top_k]]\n", + " print(f\"Fighting Words Comments between:\")\n", + " print(\"Cluster1: \", top_k_class1)\n", + " print(\"Cluster2: \", top_k_class2)\n", + "\n", + "def get_fighting_words_matching_bullets(cluster1, cluster2, similarity_result=similarity_result, summaries_and_bullets=summaries_and_bullets):\n", + " cluster1_combo = list(combinations(cluster1, 2))\n", + " matched_cluster1 = []\n", + " for convo_id1, convo_id2 in cluster1_combo:\n", + " key = f\"{convo_id1}_{convo_id2}\" if f\"{convo_id1}_{convo_id2}\" in similarity_result.keys() else f\"{convo_id2}_{convo_id1}\"\n", + " for k, result in enumerate(similarity_result[key]['result']):\n", + " for index in result:\n", + " if result[index]['score'] > 0.5:\n", + " if k == 0:\n", + " matched_cluster1.append(summaries_and_bullets[convo_id1]['bulletpoint'][index])\n", + " else:\n", + " try:\n", + " matched_cluster1.append(summaries_and_bullets[convo_id2]['bulletpoint'][index])\n", + " except Exception:\n", + " continue\n", + " \n", + " cluster2_combo = list(combinations(cluster2, 2))\n", + " matched_cluster2 = []\n", + " for convo_id1, convo_id2 in cluster2_combo:\n", + " key = f\"{convo_id1}_{convo_id2}\" if f\"{convo_id1}_{convo_id2}\" in similarity_result.keys() else f\"{convo_id2}_{convo_id1}\"\n", + " for k, result in enumerate(similarity_result[key]['result']):\n", + " for index in result:\n", + " if result[index]['score'] > 0.5:\n", + " if k == 0:\n", + " matched_cluster2.append(summaries_and_bullets[convo_id1]['bulletpoint'][index])\n", + " else:\n", + " matched_cluster2.append(summaries_and_bullets[convo_id2]['bulletpoint'][index])\n", + " \n", + " z_scores = bayes_compare_language(matched_cluster1, matched_cluster2, ngram = 3) \n", + " top_k = 15\n", + " top_k_class1 = list(reversed([(x[0], round(x[1],2)) for x in z_scores[-top_k:]]))\n", + " top_k_class2 = [(x[0], round(x[1],2)) for x in z_scores[:top_k]]\n", + " top_k_class1 = list(reversed([(x[0], round(x[1],2)) for x in z_scores[-top_k:]]))\n", + " top_k_class2 = [(x[0], round(x[1],2)) for x in z_scores[:top_k]]\n", + " print(f\"Fighting Words Comments between:\")\n", + " print(\"Cluster1: \", top_k_class1)\n", + " print(\"Cluster2: \", top_k_class2)\n", + " return matched_cluster1, matched_cluster2\n", + "\n", + "\n", + "def find_keywords_in_lst_of_text(lst, keyword):\n", + " result = []\n", + " for text in lst:\n", + " if keyword.lower() in text.lower():\n", + " result.append(text)\n", + " return result\n", + " \n", + "\n", + "def get_clusters_delta_percentage(lst_of_clusters):\n", + " for i, g in enumerate(lst_of_clusters):\n", + " temp = []\n", + " for idx in g:\n", + " if corpus.get_conversation(idx).meta['has_delta']:\n", + " temp.append(\"delta\")\n", + " else:\n", + " 
temp.append(\"no\")\n", + " print(f\"Cluster {i+1} (N={len(g)}): {temp.count('delta') / len(temp)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db126689-6872-4231-b36d-3cdbb1f5217f", + "metadata": {}, + "outputs": [], + "source": [ + "### Cluster conversations, and we notice one cluster contains much more delta convos than the other, percentage wise ###\n", + "_, c_lst = cluster_conversations(convo_2018, threshold=2.5)\n", + "get_clusters_delta_percentage(c_lst)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2be584fa-fd05-486c-bd13-88e2b37d6a66", + "metadata": {}, + "outputs": [], + "source": [ + "print(c_lst[0])\n", + "print(len(c_lst[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcf87e9b-905b-4484-b481-b7c37d800933", + "metadata": {}, + "outputs": [], + "source": [ + "print(c_lst[1])\n", + "print(len(c_lst[1]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f77c2ba-4094-4be3-8cce-b7da2832d5a9", + "metadata": {}, + "outputs": [], + "source": [ + "random_2018_less_delta_set = c_lst[0]\n", + "random_2018_more_delta_set = c_lst[1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a394c419-915a-4e5b-a856-846c1c256277", + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_PATH + \"random_2018_less_delta_set.json\", \"w\") as f:\n", + " json.dump(random_2018_less_delta_set, f, indent=4)\n", + "\n", + "with open(DATA_PATH + \"random_2018_more_delta_set.json\", \"w\") as f:\n", + " json.dump(random_2018_more_delta_set, f, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5b94e79", + "metadata": {}, + "outputs": [], + "source": [ + "b1, b2 = get_fighting_words_matching_bullets(random_2018_less_delta_set, random_2018_more_delta_set)" + ] + }, + { + "cell_type": "markdown", + "id": "3a806364-e821-4b3a-b191-ac2e345e8ae5", + "metadata": {}, + "source": [ + "## Selecting Delta Conversations\n", + "\n", + "Next, we sample a set of Δ (persuasive) and ¬Δ (non-persuasive) conversations and compare their similarity to the previously clustered groups, examining how ConDynS captures alignment between known persuasive dynamics and automatically discovered clusters." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cea50d9a-a574-4db8-a356-ea993dbaf0a2", + "metadata": {}, + "outputs": [], + "source": [ + "### First, we need to download the annotated CMV corpus ###\n", + "corpus = Corpus(filename=DATA_PATH + \"annotated_cmv_delta\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8467ff1-4cda-450b-b06e-06dad6ca7b3a", + "metadata": {}, + "outputs": [], + "source": [ + "### Helper functions ###\n", + "def is_delta_convo(convo):\n", + " return convo.meta['has_delta']\n", + "def check_only_one_delta(convo1, convo2):\n", + " return convo1.meta['has_delta'] + convo2.meta['has_delta'] == 1\n", + "def check_both_convo_valid(convo1, convo2, convo_2018=convo_2018):\n", + " utt_lst1 = [utt for utt in convo1.iter_utterances()]\n", + " utt_lst2 = [utt for utt in convo2.iter_utterances()]\n", + " return is_valid_convo(convo1, utt_lst1) and is_valid_convo(convo2, utt_lst2) and extract_real_id(convo1.id) not in convo_2018 and extract_real_id(convo2.id) not in convo_2018\n", + "def extract_real_id(idx):\n", + " return idx.split(\"_\")[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45867eea-bb6f-45ab-ac40-a718b32ae03a", + "metadata": {}, + "outputs": [], + "source": [ + "### Filtering for delta and non-delta convos in the year of 2018 ###\n", + "valid_convos = {\"delta\" : [], \"no_delta\" : []}\n", + "\n", + "for convo in tqdm(corpus.iter_conversations()):\n", + " if convo.id in valid_convos['delta'] or convo.id in valid_convos['no_delta']:\n", + " continue\n", + " pair = corpus.get_conversation(convo.meta['pair_id'])\n", + " assert check_only_one_delta(convo, pair), \"convo and its pair should have only one delta.\"\n", + " if check_both_convo_valid(convo, pair):\n", + " if get_convo_year([utt for utt in convo.iter_utterances()]) == 2018 and get_convo_year([utt for utt in pair.iter_utterances()]) == 2018:\n", + " if convo.meta['has_delta']:\n", + " assert not pair.meta['has_delta']\n", + " valid_convos['delta'].append(convo.id)\n", + " valid_convos['no_delta'].append(pair.id)\n", + " else:\n", + " assert not convo.meta['has_delta']\n", + " valid_convos['delta'].append(pair.id)\n", + " valid_convos['no_delta'].append(convo.id)\n", + "\n", + "print(\"Number of delta convos: \", len(valid_convos['delta']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a3a50b-0bd3-47eb-aa38-4be1aaf1d22a", + "metadata": {}, + "outputs": [], + "source": [ + "### Pairing up delta and non-delta convos ###\n", + "valid_convo_pairs = [(valid_convos['delta'][i], valid_convos['no_delta'][i]) for i in range(len(valid_convos['delta']))]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b98bb7e3-d317-45f5-a338-cd0cb11b2136", + "metadata": {}, + "outputs": [], + "source": [ + "### Select pairs of delta and non-delta convos, with unique speakers We always filter a little more to deal with the case when LLM can't handle the input ###\n", + "random.seed(4300)\n", + "K = 250\n", + "used_speakers = []\n", + "selected_convos = []\n", + "random.shuffle(valid_convo_pairs)\n", + "for convo1, convo2 in tqdm(valid_convo_pairs):\n", + " convo_1, convo_2 = corpus.get_conversation(convo1), corpus.get_conversation(convo2)\n", + " utt_lst1, utt_lst2 = [utt for utt in convo_1.iter_utterances()], [utt for utt in convo_2.iter_utterances()]\n", + " convo_speakers = list(set(get_all_speakers(utt_lst1) + get_all_speakers(utt_lst2)))\n", + " \n", + " if all(s not in used_speakers for s in 
convo_speakers):\n", + " selected_convos.append(convo1)\n", + " selected_convos.append(convo2)\n", + " used_speakers.extend(convo_speakers)\n", + " \n", + " if len(selected_convos) == K:\n", + " break\n", + "\n", + "len(selected_convos)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a635d786-984b-4be8-89f8-8944dd261033", + "metadata": {}, + "outputs": [], + "source": [ + "### Constructing the corpus with selected convos for the experiment ###\n", + "all_utterances = []\n", + "for convo_id in selected_convos:\n", + " convo = corpus.get_conversation(convo_id)\n", + " utt_lst = [utt for utt in convo.iter_utterances()]\n", + " all_utterances.extend(utt_lst)\n", + "\n", + "selected_corpus = Corpus(utterances=all_utterances)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b5469b4-42aa-490e-b3bf-6929d610784b", + "metadata": {}, + "outputs": [], + "source": [ + "### Annotating the corpus ###\n", + "for convo in tqdm(selected_corpus.iter_conversations()):\n", + " og_convo = corpus.get_conversation(convo.id)\n", + " convo.meta = og_convo.meta\n", + " convo.meta['year'] = '2018'\n", + "\n", + "for sp in tqdm(selected_corpus.iter_speakers()):\n", + " og_sp = corpus.get_speaker(sp.id)\n", + " sp.meta = og_sp.meta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba39e136-b3df-49dd-86f2-a4d300c709e6", + "metadata": {}, + "outputs": [], + "source": [ + "selected_corpus.dump(\"cmv_selected_delta_2018\", base_path=DATA_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "267e3753-8ffe-426f-af45-235544a823c7", + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_PATH + \"cmv_selected_delta_convo_ids_2018.json\", \"w\") as f:\n", + " json.dump(selected_convos, f, indent=4)" + ] + }, + { + "cell_type": "markdown", + "id": "72b35bc8-a6a0-4d7a-9a92-2c54c8600987", + "metadata": {}, + "source": [ + "## ConDynS Computation - Delta 2018 Set\n", + "\n", + "We compute ConDynS similarity scores within the Δ (persuasive) and ¬Δ (non-persuasive) sets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "103cf5b3-8af4-4246-8745-d400102b8547", + "metadata": {}, + "outputs": [], + "source": [ + "BASE_PATH = \"./artefacts\"\n", + "CUR_ANALYSIS = \"delta_set_2018\"\n", + "\n", + "if not os.path.exists(f\"{BASE_PATH}\"):\n", + " print(\"Making directory: \", f\"{BASE_PATH}\")\n", + " os.makedirs(f\"{BASE_PATH}\")\n", + "\n", + "if not os.path.exists(f\"{BASE_PATH}/{CUR_ANALYSIS}/\"):\n", + " print(\"Making directory: \", f\"{BASE_PATH}/{CUR_ANALYSIS}/\")\n", + " os.makedirs(f\"{BASE_PATH}/{CUR_ANALYSIS}/\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c71671ed-46e4-404c-8d44-4e4aa88f4536", + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_PATH + \"cmv_selected_delta_convo_ids_2018.json\", \"r\") as f:\n", + " delta_2018 = json.load(f)\n", + "\n", + "len(delta_2018)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "303070cf-1176-4186-a832-0bb3a1aac9cb", + "metadata": {}, + "outputs": [], + "source": [ + "corpus = Corpus(filename=DATA_PATH + \"cmv_selected_delta_2018\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dafa5df-ef8c-4a68-8ab8-7ec546f44253", + "metadata": {}, + "outputs": [], + "source": [ + "### Check the distribution of the number of utterances in the corpus ###\n", + "from collections import Counter\n", + "numbers = [len(convo.get_utterance_ids()) for convo in corpus.iter_conversations()]\n", + "counts = Counter(numbers)\n", + "print(counts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "208a9171-854f-4da6-b4e3-457583149a41", + "metadata": {}, + "outputs": [], + "source": [ + "### Computing the similarity between all conversations in the same group for delta 2018 selected convos ###\n", + "all_convos = delta_2018\n", + "data_path = f\"{BASE_PATH}/{CUR_ANALYSIS}/\"\n", + "\n", + "if os.path.exists(f\"{data_path}summary.json\"):\n", + " with open(f\"{data_path}summary.json\", \"r\") as file:\n", + " summaries_and_bullets = json.load(file)\n", + "else:\n", + " summaries_and_bullets = {}\n", + "\n", + "if os.path.exists(f\"{data_path}similarity.json\"):\n", + " with open(f\"{data_path}similarity.json\", \"r\") as file:\n", + " similarity_result = json.load(file)\n", + "else:\n", + " similarity_result = {}\n", + "\n", + "incomplete = []\n", + "\n", + "summaries_and_bullets, similarity_result, incomplete = compute_within_group_similarity(all_convos, summaries_and_bullets, similarity_result, incomplete, data_path)\n", + "\n", + "print(\"Complete!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c66c5df6-52be-48e0-a76b-2e7b99a8796b", + "metadata": {}, + "outputs": [], + "source": [ + "### Check completed computation ###\n", + "after_incomplete = [idx for idx in delta_2018 if idx not in problem_convos]\n", + "delta_set = [convo_id for convo_id in after_incomplete if corpus.get_conversation(convo_id).meta['has_delta']]\n", + "no_delta_set = [convo_id for convo_id in after_incomplete if convo_id not in delta_set]\n", + "print(len(delta_set), len(no_delta_set))\n", + "random.seed(4300)\n", + "delta_set = random.sample(delta_set, 100)\n", + "no_delta_set = random.sample(no_delta_set, 100)\n", + "delta_2018 = delta_set + no_delta_set\n", + "print(len(delta_2018))" + ] + }, + { + "cell_type": "markdown", + "id": "76c8206e-305c-4796-8c87-3756f92d6371", + "metadata": {}, + "source": [ + "# Compute Inter Group Similarity\n", + "\n", + "We use ConDynS to measure inter-group similarity 
between the random conversation set and the delta groups. This shows how similar the dynamics of random conversations are to those of persuasive ones, helping us understand what the earlier clusters actually capture.\n", + "\n", + "Here, we also demonstrate the way to set custom SCD prompts for adapting our module to your own data!\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79fa9d55-5191-411e-81e9-1e7455e2594f", + "metadata": {}, + "outputs": [], + "source": [ + "BASE_PATH = \"./artefacts\"\n", + "CUR_ANALYSIS = \"inter_group_sim\"\n", + "\n", + "if not os.path.exists(f\"{BASE_PATH}\"):\n", + " print(\"Making directory: \", f\"{BASE_PATH}\")\n", + " os.makedirs(f\"{BASE_PATH}\")\n", + "\n", + "if not os.path.exists(f\"{BASE_PATH}/{CUR_ANALYSIS}/\"):\n", + " print(\"Making directory: \", f\"{BASE_PATH}/{CUR_ANALYSIS}/\")\n", + " os.makedirs(f\"{BASE_PATH}/{CUR_ANALYSIS}/\")\n", + "\n", + "with open(\"cmv_selected_delta_convo_ids_2018.json\", \"r\") as f:\n", + " delta_2018 = json.load(f)\n", + "\n", + "with open(\"cmv_selected_convo_ids_2018.json\", \"r\") as f:\n", + " convo_2018 = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c91e49f-04eb-470e-8240-f1fbb1b3b198", + "metadata": {}, + "outputs": [], + "source": [ + "corpus_delta = Corpus(filename=DATA_PATH + \"cmv_selected_delta_2018\")\n", + "corpus_2018 = Corpus(filename=DATA_PATH + \"cmv_selected_convos_2018\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d653fcc4-2c18-4496-b0bb-38ecf44dfc87", + "metadata": {}, + "outputs": [], + "source": [ + "delta_set = [convo_id for convo_id in delta_2018 if corpus_delta.get_conversation(convo_id).meta['has_delta']]\n", + "no_delta_set = [convo_id for convo_id in delta_2018 if convo_id not in delta_set]\n", + "assert len(delta_set) == len(no_delta_set) == 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f10b485b-6bd6-4a1b-8458-5c51aab80cce", + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_PATH + \"random_2018_less_delta_set.json\", \"r\") as f:\n", + " random_2018_less_delta_set = json.load(f)\n", + "\n", + "with open(DATA_PATH + \"random_2018_more_delta_set.json\", \"r\") as f:\n", + " random_2018_more_delta_set = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "855f5289", + "metadata": {}, + "outputs": [], + "source": [ + "### Modify to SCD prompt to focus on Persuasions ###\n", + "persuasion_scd_prompt = \"\"\"Write a short summary capturing the trajectory of an online conversation. \n", + "Do not include specific topics, claims, or arguments from the conversation. The style you should avoid: \n", + "Example Sentence 1: \"Speaker1, who is Asian, defended Asians and pointed out that a study found that whites, Hispanics, and blacks were accepted into universities in that order, with Asians being accepted the least. Speaker2 acknowledged that Asians have high household income, but argued that this could be a plausible explanation for the study's findings. 
Speaker1 disagreed and stated that the study did not take wealth into consideration.\" \n", + "This style mentions specific claims and topics, which are not needed.\n", + "\n", + "Instead, do include indicators of sentiments (e.g., sarcasm, passive-aggressive, polite, frustration, attack, blame), individual intentions (e.g., agreement, disagreement, persistent-agreement, persistent-disagreement, rebuttal, defense, concession, confusion, clarification, neutral, accusation) and conversational strategies (if any) such as 'rhetorical questions', 'straw man fallacy', 'identify fallacies', and 'appealing to emotions.' \n", + "The following sentences demonstrate the style you should follow:\n", + "\n", + "Example Sentence 2: \"Both speakers have differing opinions and appeared defensive. Speaker1 attacks Speaker2 by diminishing the importance of his argument and Speaker2 blames Speaker1 for using profane words. Both speakers accuse each other of being overly judgemental of their personal qualities rather than arguments.\"\n", + "\n", + "Example Sentence 3: \"The two speakers refuted each other with back and forth accusations. Throughout the conversation, they kept harshly fault-finding with overly critical viewpoints, creating an intense and inefficient discussion.\"\n", + "\n", + "Example Sentence 4: \"Speaker1 attacks Speaker2 by questioning the relevance of his premise and Speaker2 blames Speaker1 for using profane words. Both speakers accuse each other of being overly judgemental of their personal qualities rather than arguments.\"\n", + "\n", + "Overall, the trajectory summary should capture the key moments where the tension of the conversation notably changes. Here is an example of a complete trajectory summary. \n", + "\n", + "Trajectory Summary: \n", + "Multiple users discuss minimum wage. Four speakers express their different points of view subsequently, building off of each other's arguments. Speaker1 disagrees with a specific point from Speaker2's argument, triggering Speaker2 to contradict Speaker1 in response. Then, Speaker3 jumps into the conversation to support Speaker1's argument, which leads Speaker2 to adamantly defend their argument. Speaker2 then quotes a deleted comment, giving an extensive counterargument. The overall tone remains civil.\n", + "\n", + "Now, provide the trajectory summary for the following conversation.\n", + "Conversation Transcript:\n", + "{formatted_object}\n", + "\n", + "Now, summarize this conversation. Remember, do not include specific topics, claims, or arguments from the conversation. Instead, try to capture the speakers' sentiments, intentions, and conversational/persuasive strategies. Limit the trajectory summary to 80 words. \n", + "\n", + "Trajectory Summary:\n", + "\"\"\"\n", + "\n", + "persuasion_sop_prompt = \"\"\"\n", + "Here is a trajectory summary of a conversation that lays out how the dynamics of the conversation developed. You need to parse the summary into events in order. \n", + "Follow the following guidelines:\n", + "1. Try to maintain the original language of the summary as much as you can. \n", + "2. 
Provide your output as a Python dictionary with the following structure:\n", + "_(Note: Do NOT use markdown, JSON formatting, or code block delimiters.)_ \n", + "{{\n", + " '0': \"\" // description of the event\n", + " '1': ...\n", + " ...\n", + "}}\n", + "Here is the summary:\n", + "{formatted_object}\n", + "\"\"\"\n", + "\n", + "persuasion_scd_transformer = SCD(\n", + " model_provider=MODEL_PROVIDER,\n", + " model=MODEL,\n", + " config=config,\n", + " custom_scd_prompt=persuasion_scd_prompt,\n", + " custom_sop_prompt=persuasion_sop_prompt,\n", + " generate_scd=True,\n", + " generate_sop=True,\n", + " custom_prompt_dir=\"./persuasion_prompts/\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e82c534-03b3-4c7f-b8f4-aca1d6faadcf", + "metadata": {}, + "outputs": [], + "source": [ + "### Update helper functions with persuasion prompt ###\n", + "def get_cur_convo_transcript_persuasion(corpus, convo_id):\n", + " convo = corpus.get_conversation(convo_id)\n", + " utt_list = convo.get_chronological_utterance_list()\n", + " transcription = []\n", + " spk_list = {utt_list[0].speaker.id : \"SPEAKER1\"}\n", + " for utt in utt_list:\n", + " if utt.speaker.id not in spk_list.keys():\n", + " spk_list[utt.speaker.id] = \"SPEAKER2\"\n", + " assert len(spk_list) == 2\n", + " transcription.append(spk_list[utt.speaker.id] +\": \"+utt.text)\n", + " transcription = transcription[1:] # remove first OP message\n", + " return transcription\n", + "\n", + "\n", + "def compute_within_group_similarity_persuasion(all_convos, summaries_and_bullets, similarity_result, incomplete, data_path):\n", + " all_convos_combos = list(combinations(all_convos, 2))\n", + " len(all_convos_combos)\n", + " \n", + " # ### Calling GPT to create the SCD and bullet points. 
Run with caution.\n", + " summaries_and_bullets = summaries_and_bullets\n", + " incomplete = incomplete\n", + " similarity_result = similarity_result\n", + " \n", + " # Generate SCDs and SoPs for conversations that don't have them yet\n", + " convos_to_process = [convo_id for convo_id in all_convos if convo_id not in summaries_and_bullets.keys()]\n", + " \n", + " if convos_to_process:\n", + " # Process conversations from both corpora\n", + " convos_2018 = [cid for cid in convos_to_process if cid in convo_2018]\n", + " convos_delta = [cid for cid in convos_to_process if cid in delta_2018]\n", + " \n", + " if convos_2018:\n", + " def convo_selector_2018(conversation):\n", + " return conversation.id in convos_2018\n", + " print(f\"Generating SCDs for {len(convos_2018)} conversations from 2018 corpus...\")\n", + " persuasion_scd_transformer.transform(corpus_2018, selector=convo_selector_2018)\n", + " \n", + " if convos_delta:\n", + " def convo_selector_delta(conversation):\n", + " return conversation.id in convos_delta\n", + " print(f\"Generating SCDs for {len(convos_delta)} conversations from delta corpus...\")\n", + " persuasion_scd_transformer.transform(corpus_delta, selector=convo_selector_delta)\n", + " \n", + " # Extract results from conversation metadata\n", + " for convo_id in convos_to_process:\n", + " if convo_id in convo_2018:\n", + " convo = corpus_2018.get_conversation(convo_id)\n", + " else:\n", + " convo = corpus_delta.get_conversation(convo_id)\n", + " summary = convo.meta.get(\"machine_scd\", \"\")\n", + " bulletpoint = convo.meta.get(\"machine_sop\", \"\")\n", + " summaries_and_bullets.update({convo_id : {\"summary\" : summary, \"bulletpoint\" : bulletpoint}})\n", + " with open(f\"{data_path}summary.json\", \"w\") as file:\n", + " json.dump(summaries_and_bullets, file, indent=4)\n", + "\n", + " def get_bidirection_similarity_with_retry_2(corpus_2018, corpus_delta, convo1_id, convo2_id, summaries_and_bullets, retries=10):\n", + " for i in range(retries):\n", + " try:\n", + " # Determine which corpus each conversation belongs to\n", + " if convo1_id in convo_2018:\n", + " corpus1 = corpus_2018\n", + " else:\n", + " corpus1 = corpus_delta\n", + " \n", + " if convo2_id in convo_2018:\n", + " corpus2 = corpus_2018\n", + " else:\n", + " corpus2 = corpus_delta\n", + " \n", + " # If both conversations are in the same corpus, use compare_conversations\n", + " if corpus1 == corpus2:\n", + " result, score = condyns.compare_conversations(\n", + " corpus1, convo1_id, convo2_id, \n", + " sop_meta_name=\"machine_sop\"\n", + " )\n", + " else:\n", + " # For cross-corpus comparison, use the lower-level API\n", + " convo1 = corpus1.get_conversation(convo1_id)\n", + " convo2 = corpus2.get_conversation(convo2_id)\n", + " \n", + " # Format transcripts\n", + " transcript1 = get_cur_convo_transcript_persuasion(corpus1, convo1_id)\n", + " transcript2 = get_cur_convo_transcript_persuasion(corpus2, convo2_id)\n", + " \n", + " sop1 = convo1.meta.get(\"machine_sop\", \"\")\n", + " sop2 = convo2.meta.get(\"machine_sop\", \"\")\n", + " \n", + " result = condyns.compute_bidirectional_similarity(\n", + " \"\\n\\n\".join(transcript1), \"\\n\\n\".join(transcript2), sop1, sop2\n", + " )\n", + " score = np.mean(condyns.compute_score_from_results(result))\n", + " \n", + " return score, result\n", + " except Exception as e:\n", + " wait = 0.5 ** i + random.random()\n", + " print(f\"Retrying ({convo1_id}, {convo2_id}) after {wait:.2f}s due to error: {e}\")\n", + " incomplete.append(f'{convo1_id}_{convo2_id}')\n", + 
" time.sleep(wait)\n", + " return None, None\n", + " \n", + " # The thread worker function\n", + " def worker(corpus_2018, corpus_delta, convo1_id, convo2_id, summaries_and_bullets):\n", + " score, result = get_bidirection_similarity_with_retry_2(corpus_2018, corpus_delta, convo1_id, convo2_id, summaries_and_bullets)\n", + " return (f'{convo1_id}_{convo2_id}', {\"score\": score, \"result\": result})\n", + "\n", + " MAX_WORKER = 50\n", + " \n", + " # Build task list\n", + " tasks = [(corpus_2018, corpus_delta, id1, id2, summaries_and_bullets) \n", + " for id1, id2 in all_convos_combos \n", + " if f'{id1}_{id2}' not in similarity_result and f'{id1}_{id2}' not in incomplete and f'{id2}_{id1}' not in similarity_result and f'{id2}_{id1}' not in incomplete]\n", + " \n", + " # Set up thread pool\n", + " with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:\n", + " futures = [executor.submit(worker, *task) for task in tasks]\n", + " \n", + " for future in tqdm(as_completed(futures), total=len(futures), desc=\"Computing Pair-wise Similarity\"):\n", + " key, value = future.result()\n", + " if key in similarity_result.keys():\n", + " print(\"not good, repeated keys\")\n", + " if key not in incomplete:\n", + " similarity_result[key] = value\n", + "\n", + " with open(f\"{data_path}similarity.json\", \"w\") as file:\n", + " json.dump(similarity_result, file, indent=4)\n", + "\n", + " with open(f\"{data_path}incomplete.json\", \"w\") as file:\n", + " json.dump(incomplete, file, indent=4)\n", + "\n", + " return summaries_and_bullets, similarity_result, incomplete" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9efafe5b-a454-425d-9762-0f79ee8c7dc0", + "metadata": {}, + "outputs": [], + "source": [ + "all_convos = convo_2018 + delta_2018\n", + "data_path = f\"{BASE_PATH}/{CUR_ANALYSIS}/\"\n", + "\n", + "if os.path.exists(f\"{data_path}summary.json\"):\n", + " with open(f\"{data_path}summary.json\", \"r\") as file:\n", + " summaries_and_bullets = json.load(file)\n", + "else:\n", + " summaries_and_bullets = {}\n", + "\n", + "if os.path.exists(f\"{data_path}similarity.json\"):\n", + " with open(f\"{data_path}similarity.json\", \"r\") as file:\n", + " similarity_result = json.load(file)\n", + "else:\n", + " similarity_result = {}\n", + "\n", + "incomplete = []\n", + "\n", + "summaries_and_bullets, similarity_result, incomplete = compute_within_group_similarity_persuasion(all_convos, summaries_and_bullets, similarity_result, incomplete, data_path)\n", + "\n", + "print(\"Complete!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93a25057-ccb4-4ee3-9bd8-03b5dcdf9445", + "metadata": {}, + "outputs": [], + "source": [ + "### Verify all similarity are computed valid here:\n", + "with open(f\"{data_path}summary.json\", \"r\") as file:\n", + " summaries_and_bullets = json.load(file)\n", + "for convo_id in all_convos:\n", + " assert convo_id in summaries_and_bullets\n", + "\n", + "with open(f\"{data_path}similarity.json\", \"r\") as file:\n", + " similarity_result = json.load(file)\n", + "all_convos_combos = list(combinations(convo_2018, 2))\n", + "for id1, id2 in all_convos_combos:\n", + " assert f'{id1}_{id2}' in similarity_result or f'{id2}_{id1}' in similarity_result" + ] + }, + { + "cell_type": "markdown", + "id": "44a5e4c9-430e-4f7d-aa3e-ce9da189012a", + "metadata": {}, + "source": [ + "### Analysis - Inter/Intra group similarity" + ] + }, + { + "cell_type": "markdown", + "id": "f0e9159e-c470-4586-b3cc-acd29e9d8d39", + "metadata": {}, + "source": [ + "#### 
Inter Group Similarity\n", + "\n", + "Here we present the between group similarities from the random set with delta/no delta sets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebf7b6a8-b17e-4bb2-99a4-aa57073ed9d9", + "metadata": {}, + "outputs": [], + "source": [ + "def compute_between_group_similarity(group1, group2):\n", + " similarities = []\n", + " for convo1 in group1:\n", + " for convo2 in group2:\n", + " sim = np.mean(get_similarity(convo1, convo2, all_convos, similarity_result))\n", + " similarities.append(sim)\n", + " return np.array(similarities)\n", + " \n", + "\n", + "def between_group_similarity_check(group1, group2, group1_name, group2_name, y_lim=4):\n", + " between_similarities = compute_between_group_similarity(group1, group2)\n", + " \n", + " between_mean = np.mean(between_similarities)\n", + " \n", + " plt.figure(figsize=(6, 4))\n", + " plt.hist(between_similarities, bins=20, alpha=0.7, color='purple', label=f'{group1_name} vs {group2_name}', density=False)\n", + " \n", + " plt.xlabel(\"similarity score\")\n", + " plt.ylabel(\"number of conversation pairs\")\n", + " plt.title(f\"Between-Group Similarity: {group1_name} vs {group2_name}\")\n", + " plt.legend()\n", + " plt.grid(True, linestyle='--', alpha=0.6)\n", + " plt.xlim(0, 1)\n", + " plt.ylim(0, y_lim)\n", + " \n", + " plt.show()\n", + " \n", + " print(f\"Between-Group Mean Similarity ({group1_name} vs {group2_name}): {between_mean:.4f}\")\n", + "\n", + " return between_similarities" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23efd4e8-2e5d-4bb7-b10f-348e832dc996", + "metadata": {}, + "outputs": [], + "source": [ + "group1_delta = between_group_similarity_check(random_2018_less_delta_set, delta_set, \"group1\", \"delta\", y_lim=2000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1397b56a-780d-4e06-8523-b34d8414f798", + "metadata": {}, + "outputs": [], + "source": [ + "group1_delta = between_group_similarity_check(random_2018_more_delta_set, delta_set, \"group2\", \"delta\", y_lim=2000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f42a4a73-70fe-4a19-8879-050188009818", + "metadata": {}, + "outputs": [], + "source": [ + "group1_delta = between_group_similarity_check(random_2018_less_delta_set, no_delta_set, \"group1\", \"no delta\", y_lim=2000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88e37ce2-7d40-4c5b-8440-8c0d84371d02", + "metadata": {}, + "outputs": [], + "source": [ + "group1_delta = between_group_similarity_check(random_2018_more_delta_set, no_delta_set, \"group2\", \"no delta\", y_lim=2000)" + ] + }, + { + "cell_type": "markdown", + "id": "ab2f08dc-ebb1-40c7-827c-0d3d87ee27e0", + "metadata": {}, + "source": [ + "#### Intra Group Similarity\n", + "\n", + "Here we present the Intra group similarity within delta set and within no delta set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aa4f5f6-a64d-4c39-b080-4009c1899fa0", + "metadata": {}, + "outputs": [], + "source": [ + "two_groups_intra_group_similarity_check(delta_set, no_delta_set, \"Persuasive\", \"Non-persuasive\", all_convos, similarity_result, y_lim = 700)" + ] + }, + { + "cell_type": "markdown", + "id": "f48fb214-6974-471c-bfa0-9086d10ea7c4", + "metadata": {}, + "source": [ + "# New Insights: Who Drives The Dynamics?\n", + "\n", + "We investigate which speaker’s tendencies shape conversational dynamics by comparing conversations where the same individual appears as the original poster 
(OP) versus as the challenger. Using ConDynS, we quantify the similarity of dynamics across these role-specific conversations to assess whether situational power (held by the OP) or persuasive strategy (driven by the challenger) has a stronger influence on the interaction’s trajectory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48676c84-1dd4-4c67-a3ba-136b6df97f20", + "metadata": {}, + "outputs": [], + "source": [ + "corpus = Corpus(filename=DATA_PATH + \"subreddit-changemyview\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa4f75d8-09a7-4d5c-9121-5de21d82c212", + "metadata": {}, + "outputs": [], + "source": [ + "### Annotate Delta\n", + "for utt in tqdm(corpus.iter_utterances()):\n", + " if (\n", + " utt.reply_to is not None\n", + " and utt.speaker.id == \"DeltaBot\"\n", + " and \"delta awarded\" in utt.text\n", + " ):\n", + " deltabot_text = utt.text\n", + " match = re.search(\n", + " r\"(?:Confirmed: 1 delta awarded to )(?:\\/)?(?:u\\/)([\\w-]+)\", deltabot_text\n", + " )\n", + " if match is not None:\n", + " try:\n", + " delta_utt = corpus.get_utterance(utt.reply_to)\n", + " delta_utt.meta['got_delta'] = True\n", + " except KeyError:\n", + " continue" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b58b3dc1-2057-4396-8a93-a62378a47d77", + "metadata": {}, + "outputs": [], + "source": [ + "BASE_PATH = \"./artefacts\"\n", + "CUR_ANALYSIS = \"speaker_consistency_same_op_contender\"\n", + "\n", + "if not os.path.exists(f\"{BASE_PATH}\"):\n", + " print(\"Making directory: \", f\"{BASE_PATH}\")\n", + " os.makedirs(f\"{BASE_PATH}\")\n", + "\n", + "if not os.path.exists(f\"{BASE_PATH}/{CUR_ANALYSIS}/\"):\n", + " print(\"Making directory: \", f\"{BASE_PATH}/{CUR_ANALYSIS}/\")\n", + " os.makedirs(f\"{BASE_PATH}/{CUR_ANALYSIS}/\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6667e91b-cf8c-42fb-8786-14315d8048c1", + "metadata": {}, + "outputs": [], + "source": [ + "def get_replier_id(utt_lst):\n", + " return utt_lst[1].speaker.id\n", + "\n", + "def get_op_id(utt_lst):\n", + " return utt_lst[0].speaker.id\n", + "\n", + "def get_rp_id(utt_lst):\n", + " return utt_lst[1].speaker.id\n", + "\n", + "def get_convo_id(utt_lst):\n", + " return utt_lst[0].get_conversation().id\n", + "\n", + "def get_convo_op_id(utt_lsts):\n", + " return get_op_id(utt_lsts[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "625ede32-8c16-4322-9003-dfa960bd077b", + "metadata": {}, + "outputs": [], + "source": [ + "### First, we find all the valid two speaker threads that is at correct length 5+, for each conversation.\n", + "convo_to_two_speaker_threads = {}\n", + "for convo in tqdm(corpus.iter_conversations()):\n", + " try:\n", + " all_utt_lsts = convo.get_root_to_leaf_paths()\n", + " except ValueError:\n", + " continue\n", + "\n", + " valid_two_speaker_utt_lsts = []\n", + " found_repliers = []\n", + " for utt_lst in all_utt_lsts:\n", + " if is_valid_convo(convo, utt_lst):\n", + " replier = get_replier_id(utt_lst)\n", + " if replier not in found_repliers:\n", + " valid_two_speaker_utt_lsts.append(utt_lst)\n", + " found_repliers.append(replier)\n", + " if valid_two_speaker_utt_lsts:\n", + " convo_to_two_speaker_threads[convo.id] = valid_two_speaker_utt_lsts\n", + "\n", + "len(convo_to_two_speaker_threads)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "565d9efc-2357-4a9b-a8b8-7191ee8461d5", + "metadata": {}, + "outputs": [], + "source": [ + "for convo_id, utt_lsts in 
convo_to_two_speaker_threads.items():\n", + " op = get_convo_op_id(utt_lsts)\n", + " for utt_lst in utt_lsts:\n", + " assert get_op_id(utt_lst) == op" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83596ac8-9c6f-45ed-8ca1-37f764c693a1", + "metadata": {}, + "outputs": [], + "source": [ + "### we find all the times the speaker is whatever role\n", + "random.seed(4300)\n", + "sp_to_convos = {}\n", + "sp_to_convos_id = {}\n", + "for convo_id, utt_lsts in convo_to_two_speaker_threads.items():\n", + " op = get_convo_op_id(utt_lsts)\n", + " if op not in sp_to_convos_id.keys() or convo_id not in sp_to_convos_id[op]:\n", + " if op not in sp_to_convos.keys():\n", + " sp_to_convos[op] = []\n", + " sp_to_convos[op].append(random.choice(utt_lsts))\n", + " if op not in sp_to_convos_id.keys():\n", + " sp_to_convos_id[op] = []\n", + " sp_to_convos_id[op].append(convo_id)\n", + " for utt_lst in utt_lsts:\n", + " rp = get_rp_id(utt_lst)\n", + " if op == rp: continue\n", + " if rp not in sp_to_convos_id.keys() or convo_id not in sp_to_convos_id[rp]:\n", + " if rp not in sp_to_convos.keys():\n", + " sp_to_convos[rp] = []\n", + " sp_to_convos[rp].append(utt_lst)\n", + " if rp not in sp_to_convos_id.keys():\n", + " sp_to_convos_id[rp] = []\n", + " sp_to_convos_id[rp].append(convo_id)\n", + "\n", + "enough_convo_sps = [sp for sp, lst in sp_to_convos.items() if len(lst) >= 2]\n", + "print(f\"number of speakers with two or more conversations: {len(enough_convo_sps)}\")\n", + "selected_convo_sps = random.sample(enough_convo_sps, 2000)\n", + "selected_convos_random_role = {}\n", + "for sp_id in selected_convo_sps:\n", + " selection_lst = sp_to_convos[sp_id]\n", + " assert len(selection_lst) >= 2\n", + " if len(selection_lst) == 2:\n", + " selected_convos_random_role.update({sp_id : selection_lst})\n", + " else:\n", + " selected_convos_random_role.update({sp_id : random.sample(selection_lst, 2)})\n", + "print(f\"selected number of speakers to compare: {len(selected_convos_random_role)}\")\n", + "\n", + "### Ensure no repeated convo selected for the same speaker\n", + "for sp_id, utt_lsts in selected_convos_random_role.items():\n", + " assert len(utt_lsts) == 2\n", + " assert utt_lsts[0][0].get_conversation().id != utt_lsts[1][0].get_conversation().id " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11e1b878-19d8-436c-a24b-13bb7e341352", + "metadata": {}, + "outputs": [], + "source": [ + "def get_cur_convo_transcript(utt_lst):\n", + " transcription = []\n", + " spk_list = {utt_lst[0].speaker.id : \"SPEAKER1\"}\n", + " for utt in utt_lst:\n", + " if utt.speaker.id not in spk_list.keys():\n", + " spk_list[utt.speaker.id] = \"SPEAKER2\"\n", + " assert len(spk_list) == 2\n", + " transcription.append(spk_list[utt.speaker.id] +\": \"+utt.text)\n", + " transcription = transcription[1:] ### truncate OP first message\n", + " return transcription\n", + " \n", + "def get_bidirection_similarity_with_retry_utt_lst(corpus, convo1_id, convo2_id, utt_lst_1, utt_lst_2, summaries_and_bullets, incomplete, retries=10):\n", + " transcript1 = get_cur_convo_transcript(utt_lst_1)\n", + " transcript2 = get_cur_convo_transcript(utt_lst_2)\n", + " \n", + " scd1 = summaries_and_bullets[convo1_id]['summary']\n", + " scd2 = summaries_and_bullets[convo2_id]['summary']\n", + " \n", + " for i in range(retries):\n", + " try:\n", + " result = condyns.compute_bidirectional_similarity(transcript1, transcript2, scd1, scd2)\n", + " score = condyns.compute_score_from_results(result)\n", + " return score, 
result, incomplete\n",
+    "        except Exception as e:\n",
+    "            wait = 0.5 ** i + random.random()\n",
+    "            print(f\"Retrying ({convo1_id}, {convo2_id}) after {wait:.2f}s due to error: {e}\")\n",
+    "            # record the failed pair (incomplete is a set)\n",
+    "            incomplete.add(f'{convo1_id}_{convo2_id}')\n",
+    "            time.sleep(wait)\n",
+    "    return None, None, incomplete"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "09ee713c-b0db-4cf3-a668-0cb6c222f6f1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def split_dict(d, n):\n",
+    "    items = list(d.items())\n",
+    "    chunk_size = math.ceil(len(items) / n)\n",
+    "    return [dict(items[i:i + chunk_size]) for i in range(0, len(items), chunk_size)]\n",
+    "\n",
+    "# === Config and setup ===\n",
+    "all_need_to_compares = [selected_convos_random_role]\n",
+    "all_ROLE = [\"RANDOM_ROLE\"]\n",
+    "\n",
+    "for ROLE, need_to_compares in zip(all_ROLE, all_need_to_compares):\n",
+    "    SAVE_PATH = f\"{BASE_PATH}/{CUR_ANALYSIS}/{ROLE}\"\n",
+    "    os.makedirs(SAVE_PATH, exist_ok=True)\n",
+    "    \n",
+    "    # === Load previously saved data ===\n",
+    "    def load_json(path, default):\n",
+    "        return json.load(open(path)) if os.path.exists(path) else default\n",
+    "    \n",
+    "    summaries_and_bullets = load_json(f\"{SAVE_PATH}/summary.json\", {})\n",
+    "    similarity_result = load_json(f\"{SAVE_PATH}/similarity.json\", {})\n",
+    "    incomplete = set(load_json(f\"{SAVE_PATH}/incomplete.json\", []))\n",
+    "    \n",
+    "    # === Worker Function ===\n",
+    "    def process_chunk(chunk):\n",
+    "        local_summaries = {}\n",
+    "        local_similarity = {}\n",
+    "        local_incomplete = set()\n",
+    "        \n",
+    "        for sp_id, (utt_lst1, utt_lst2) in chunk.items():\n",
+    "            id1 = f\"{sp_id}##{utt_lst1[0].get_conversation().id}\"\n",
+    "            id2 = f\"{sp_id}##{utt_lst2[0].get_conversation().id}\"\n",
+    "            \n",
+    "            for idx, utt_lst in zip([id1, id2], [utt_lst1, utt_lst2]):\n",
+    "                if idx in summaries_and_bullets:\n",
+    "                    continue\n",
+    "                \n",
+    "                # Get the conversation ID from the utterance list\n",
+    "                convo_id = utt_lst[0].get_conversation().id\n",
+    "                convo = corpus.get_conversation(convo_id)\n",
+    "                \n",
+    "                # Check if SCD/SoP already exists in conversation metadata\n",
+    "                if \"machine_scd\" in convo.meta and \"machine_sop\" in convo.meta:\n",
+    "                    summary = convo.meta[\"machine_scd\"]\n",
+    "                    bulletpoint = convo.meta[\"machine_sop\"]\n",
+    "                else:\n",
+    "                    # If not, we need to generate it first using the transformer\n",
+    "                    # Create a selector for just this conversation\n",
+    "                    def single_convo_selector(conversation):\n",
+    "                        return conversation.id == convo_id\n",
+    "                    \n",
+    "                    persuasion_scd_transformer.transform(corpus, selector=single_convo_selector)\n",
+    "                    summary = convo.meta.get(\"machine_scd\", \"\")\n",
+    "                    bulletpoint = convo.meta.get(\"machine_sop\", \"\")\n",
+    "                \n",
+    "                local_summaries[idx] = {\"summary\": summary, \"bulletpoint\": bulletpoint}\n",
+    "            \n",
+    "            id_pair_key_1 = f'{id1}_{id2}'\n",
+    "            id_pair_key_2 = f'{id2}_{id1}'\n",
+    "            \n",
+    "            if id_pair_key_1 not in similarity_result and id_pair_key_1 not in incomplete \\\n",
+    "                and id_pair_key_2 not in similarity_result and id_pair_key_2 not in incomplete:\n",
+    "                score, result, inc = get_bidirection_similarity_with_retry_utt_lst(corpus, id1, id2, utt_lst1, utt_lst2, summaries_and_bullets, incomplete)\n",
+    "                local_similarity[id_pair_key_1] = {\"score\": score, \"result\": result}\n",
+    "                local_incomplete.update(inc)\n",
+    "        \n",
+    "        return local_summaries, local_similarity, local_incomplete\n",
+    "    \n",
+    "    # === Split work and run with threads ===\n",
+    "    NUM_WORKERS = 25\n",
+    "    chunks = 
split_dict(need_to_compares, NUM_WORKERS)\n", + " \n", + " with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:\n", + " futures = [executor.submit(process_chunk, chunk) for chunk in chunks]\n", + " \n", + " for future in tqdm(as_completed(futures), total=len(futures)):\n", + " local_summaries, local_similarity, local_incomplete = future.result()\n", + " summaries_and_bullets.update(local_summaries)\n", + " similarity_result.update(local_similarity)\n", + " incomplete.update(local_incomplete)\n", + " \n", + " # === Save to files ===\n", + " with open(f\"{SAVE_PATH}/summary.json\", \"w\") as file:\n", + " json.dump(summaries_and_bullets, file, indent=4)\n", + " \n", + " with open(f\"{SAVE_PATH}/similarity.json\", \"w\") as file:\n", + " json.dump(similarity_result, file, indent=4)\n", + " \n", + " with open(f\"{SAVE_PATH}/incomplete.json\", \"w\") as file:\n", + " json.dump(list(incomplete), file, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b57ee36c-85d1-44f0-8576-c9f312022851", + "metadata": {}, + "outputs": [], + "source": [ + "need_to_compares = selected_convos_random_role\n", + "ROLE = \"RANDOM_ROLE\"\n", + "SAVE_PATH = f\"{BASE_PATH}/{CUR_ANALYSIS}/{ROLE}\"\n", + "\n", + "with open(f\"{SAVE_PATH}/similarity.json\", \"r\") as file:\n", + " similarity_result_random = json.load(file)\n", + "\n", + "RANDOM_similarity = []\n", + "\n", + "for sp_id, (utt_lst1, utt_lst2) in need_to_compares.items():\n", + " id1 = f\"{sp_id}##{utt_lst1[0].get_conversation().id}\"\n", + " id2 = f\"{sp_id}##{utt_lst2[0].get_conversation().id}\"\n", + " \n", + " sim_key = f'{id1}_{id2}' if f'{id1}_{id2}' in similarity_result_random.keys() else f'{id2}_{id1}'\n", + " assert sim_key in similarity_result_random.keys()\n", + " try:\n", + " RANDOM_similarity.append(np.mean(similarity_result_random[sim_key]['score']))\n", + " except TypeError:\n", + " pass\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a956dcd-77df-4992-93ee-51865bb1934c", + "metadata": {}, + "outputs": [], + "source": [ + "round(np.mean(RANDOM_similarity), 4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45b1a7d4", + "metadata": {}, + "outputs": [], + "source": [ + "random.seed(4300)\n", + "all_convos_to_choose = list(convo_to_two_speaker_threads.keys())\n", + "random.shuffle(all_convos_to_choose)\n", + "selected_pairs = list(zip(all_convos_to_choose[::2], all_convos_to_choose[1::2]))[:2000]\n", + "selected_pairs_utt_lsts = {}\n", + "for i, (idx1, idx2) in enumerate(selected_pairs):\n", + " selected_pairs_utt_lsts.update({i : (random.choice(convo_to_two_speaker_threads[idx1]), random.choice(convo_to_two_speaker_threads[idx2]))})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e53f90b9", + "metadata": {}, + "outputs": [], + "source": [ + "def split_dict(d, n):\n", + " items = list(d.items())\n", + " chunk_size = math.ceil(len(items) / n)\n", + " return [dict(items[i:i + chunk_size]) for i in range(0, len(items), chunk_size)]\n", + "\n", + "# === Config and setup ===\n", + "\n", + "need_to_compares = selected_pairs_utt_lsts\n", + "ROLE = \"COMPLETELY_RANDOM_SPEAKER_AND_CONVERSATIONS\"\n", + "\n", + "SAVE_PATH = f\"{BASE_PATH}/{CUR_ANALYSIS}/{ROLE}\"\n", + "os.makedirs(SAVE_PATH, exist_ok=True)\n", + "\n", + "# === Load previously saved data ===\n", + "def load_json(path, default):\n", + " return json.load(open(path)) if os.path.exists(path) else default\n", + "\n", + "summaries_and_bullets = 
load_json(f\"{SAVE_PATH}/summary.json\", {})\n", + "similarity_result = load_json(f\"{SAVE_PATH}/similarity.json\", {})\n", + "incomplete = set(load_json(f\"{SAVE_PATH}/incomplete.json\", []))\n", + "\n", + "# === Worker Function ===\n", + "def process_chunk(chunk):\n", + " local_summaries = {}\n", + " local_similarity = {}\n", + " local_incomplete = set()\n", + "\n", + " for sp_id, (utt_lst1, utt_lst2) in chunk.items():\n", + " id1 = f\"{sp_id}##{utt_lst1[0].get_conversation().id}\"\n", + " id2 = f\"{sp_id}##{utt_lst2[0].get_conversation().id}\"\n", + "\n", + " for idx, utt_lst in zip([id1, id2], [utt_lst1, utt_lst2]):\n", + " if idx in summaries_and_bullets:\n", + " continue\n", + " # Get the conversation ID from the utterance list\n", + " convo_id = utt_lst[0].get_conversation().id\n", + " convo = corpus.get_conversation(convo_id)\n", + " \n", + " # Check if SCD/SoP already exists in conversation metadata\n", + " if \"machine_scd\" in convo.meta and \"machine_sop\" in convo.meta:\n", + " summary = convo.meta[\"machine_scd\"]\n", + " bulletpoint = convo.meta[\"machine_sop\"]\n", + " else:\n", + " # If not, we need to generate it first using the transformer\n", + " # Create a selector for just this conversation\n", + " def single_convo_selector(conversation):\n", + " return conversation.id == convo_id\n", + " \n", + " scd_transformer.transform(corpus, selector=single_convo_selector)\n", + " summary = convo.meta.get(\"machine_scd\", \"\")\n", + " bulletpoint = convo.meta.get(\"machine_sop\", \"\")\n", + " \n", + " local_summaries[idx] = {\"summary\": summary, \"bulletpoint\": bulletpoint}\n", + "\n", + " id_pair_key_1 = f'{id1}_{id2}'\n", + " id_pair_key_2 = f'{id2}_{id1}'\n", + "\n", + " if id_pair_key_1 not in similarity_result and id_pair_key_1 not in incomplete \\\n", + " and id_pair_key_2 not in similarity_result and id_pair_key_2 not in incomplete:\n", + " # Get the conversation IDs from utterance lists\n", + " convo_id1 = utt_lst1[0].get_conversation().id\n", + " convo_id2 = utt_lst2[0].get_conversation().id\n", + " \n", + " # Use the compare_conversations method if both conversations are in the same corpus\n", + " try:\n", + " result, score = condyns.compare_conversations(\n", + " corpus, convo_id1, convo_id2, \n", + " sop_meta_name=\"machine_sop\"\n", + " )\n", + " inc = set()\n", + " except Exception as e:\n", + " print(f\"Error comparing {convo_id1} and {convo_id2}: {e}\")\n", + " score, result, inc = None, None, {f'{id1}_{id2}'}\n", + " local_similarity[id_pair_key_1] = {\"score\": score, \"result\": result}\n", + " local_incomplete.update(inc)\n", + "\n", + " return local_summaries, local_similarity, local_incomplete\n", + "\n", + "# === Split work and run with threads ===\n", + "NUM_WORKERS = 25\n", + "chunks = split_dict(need_to_compares, NUM_WORKERS)\n", + "\n", + "with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:\n", + " futures = [executor.submit(process_chunk, chunk) for chunk in chunks]\n", + "\n", + " for future in tqdm(as_completed(futures), total=len(futures)):\n", + " local_summaries, local_similarity, local_incomplete = future.result()\n", + " summaries_and_bullets.update(local_summaries)\n", + " similarity_result.update(local_similarity)\n", + " incomplete.update(local_incomplete)\n", + "\n", + "# === Save to files ===\n", + "with open(f\"{SAVE_PATH}/summary.json\", \"w\") as file:\n", + " json.dump(summaries_and_bullets, file, indent=4)\n", + "\n", + "with open(f\"{SAVE_PATH}/similarity.json\", \"w\") as file:\n", + " json.dump(similarity_result, 
file, indent=4)\n", + "\n", + "with open(f\"{SAVE_PATH}/incomplete.json\", \"w\") as file:\n", + " json.dump(list(incomplete), file, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c628e342", + "metadata": {}, + "outputs": [], + "source": [ + "need_to_compares = selected_pairs_utt_lsts\n", + "ROLE = \"COMPLETELY_RANDOM_SPEAKER_AND_CONVERSATIONS\"\n", + "SAVE_PATH = f\"{BASE_PATH}/{CUR_ANALYSIS}/{ROLE}\"\n", + "\n", + "with open(f\"{SAVE_PATH}/similarity.json\", \"r\") as file:\n", + " similarity_result_c_random = json.load(file)\n", + "\n", + "COMPLETE_RANDOME_similarity = []\n", + "\n", + "for sp_id, (utt_lst1, utt_lst2) in need_to_compares.items():\n", + " id1 = f\"{sp_id}##{utt_lst1[0].get_conversation().id}\"\n", + " id2 = f\"{sp_id}##{utt_lst2[0].get_conversation().id}\"\n", + " \n", + " sim_key = f'{id1}_{id2}' if f'{id1}_{id2}' in similarity_result_c_random.keys() else f'{id2}_{id1}'\n", + " assert sim_key in similarity_result_c_random.keys()\n", + " try:\n", + " COMPLETE_RANDOME_similarity.append(np.mean(similarity_result_c_random[sim_key]['score']))\n", + " except TypeError:\n", + " print(similarity_result_c_random[sim_key]['score'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7af825ef", + "metadata": {}, + "outputs": [], + "source": [ + "np.mean(COMPLETE_RANDOME_similarity)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28b3b407", + "metadata": {}, + "outputs": [], + "source": [ + "stat, p_value = mannwhitneyu(COMPLETE_RANDOME_similarity, RANDOM_similarity, alternative='two-sided')\n", + "print(f\"Mann Whitney statistic: {stat}\")\n", + "print(f\"p-value: {p_value}\")" + ] + }, + { + "cell_type": "markdown", + "id": "1c89b373", + "metadata": {}, + "source": [ + "## OP or Challenger Drive the Dynamics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a54b28d", + "metadata": {}, + "outputs": [], + "source": [ + "sp_op_convos = {}\n", + "for convo_id, utt_lsts in convo_to_two_speaker_threads.items():\n", + " for utt_lst in utt_lsts:\n", + " op = get_op_id(utt_lst)\n", + " if op not in sp_op_convos.keys():\n", + " sp_op_convos[op] = []\n", + " sp_op_convos[op].append(utt_lst)\n", + "\n", + "sp_rp_convos = {}\n", + "for convo_id, utt_lsts in convo_to_two_speaker_threads.items():\n", + " for utt_lst in utt_lsts:\n", + " rp = get_replier_id(utt_lst)\n", + " if rp not in sp_rp_convos.keys():\n", + " sp_rp_convos[rp] = []\n", + " sp_rp_convos[rp].append(utt_lst)\n", + "\n", + "valid_speaker = [op for op, lst in sp_op_convos.items() if len(lst) >= 2 and op in sp_rp_convos.keys() and len(sp_rp_convos[op]) >= 2]\n", + "random.seed(4300)\n", + "random.shuffle(valid_speaker)\n", + "\n", + "for v in sp_op_convos.values():\n", + " random.shuffle(v)\n", + "\n", + "for v in sp_rp_convos.values():\n", + " random.shuffle(v)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bddeabba", + "metadata": {}, + "outputs": [], + "source": [ + "selected_pairs = [] # (sp_id, convo1, convo2, convo3, convo4)\n", + "used_convos = []\n", + "used_sps = []\n", + "\n", + "for sp_id in valid_speaker:\n", + " if sp_id in used_sps: continue\n", + " op_convos = sp_op_convos[sp_id]\n", + " rp_convos = sp_rp_convos[sp_id]\n", + " selected_op_convos = []\n", + " selected_rp_convos = []\n", + "\n", + " local_used_sps = []\n", + " local_used_convos = []\n", + "\n", + " for utt_lst in op_convos:\n", + " assert is_valid_convo(utt_lst[0].get_conversation(), utt_lst)\n", + " convo_id = 
get_convo_id(utt_lst)\n", + " if convo_id not in used_convos and convo_id not in local_used_convos:\n", + " rp = get_replier_id(utt_lst)\n", + " all_sps = get_all_speakers(utt_lst)\n", + " if all([x not in used_sps for x in all_sps]) and (rp not in local_used_sps):\n", + " selected_op_convos.append(utt_lst)\n", + " local_used_sps.append(rp)\n", + " local_used_convos.append(convo_id)\n", + " if len(selected_op_convos) == 2:\n", + " break\n", + "\n", + " for utt_lst in rp_convos:\n", + " assert is_valid_convo(utt_lst[0].get_conversation(), utt_lst)\n", + " convo_id = utt_lst[0].get_conversation().id\n", + " if convo_id not in used_convos and convo_id not in local_used_convos:\n", + " op = get_op_id(utt_lst)\n", + " all_sps = get_all_speakers(utt_lst)\n", + " if all([x not in used_sps for x in all_sps]) and (op not in local_used_sps):\n", + " selected_rp_convos.append(utt_lst)\n", + " local_used_sps.append(op)\n", + " local_used_convos.append(convo_id)\n", + " if len(selected_rp_convos) == 2:\n", + " break\n", + "\n", + " if len(selected_op_convos) == 2 and len(selected_rp_convos) == 2:\n", + " selected_pairs.append((sp_id, selected_op_convos, selected_rp_convos))\n", + " used_convos.extend(local_used_convos)\n", + " local_used_sps.append(sp_id)\n", + " # used_sps.extend(local_used_sps)\n", + " convo1, convo2 = selected_op_convos\n", + " convo3, convo4 = selected_rp_convos\n", + " all_local_speakers = get_all_speakers(convo1) + get_all_speakers(convo2) + get_all_speakers(convo3) + get_all_speakers(convo4)\n", + " all_local_speakers = list(set(all_local_speakers))\n", + " used_sps.extend(all_local_speakers)\n", + "\n", + "len(selected_pairs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81d1bc1a", + "metadata": {}, + "outputs": [], + "source": [ + "### Validate that no duplicate convo or speaker selected.\n", + "selected_convos = []\n", + "selected_sps = []\n", + "for sp_id, (convo1, convo2), (convo3, convo4) in selected_pairs:\n", + " selected_convos.extend([get_convo_id(convo1), get_convo_id(convo2), get_convo_id(convo3), get_convo_id(convo4)])\n", + " all_local_speakers = get_all_speakers(convo1) + get_all_speakers(convo2) + get_all_speakers(convo3) + get_all_speakers(convo4)\n", + " all_local_speakers = list(set(all_local_speakers))\n", + " assert len(all_local_speakers) == 5\n", + " selected_sps.extend(all_local_speakers)\n", + "\n", + "assert len(selected_convos) == len(list(set(selected_convos)))\n", + "assert len(selected_sps) == len(list(set(selected_sps)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9b0ab72", + "metadata": {}, + "outputs": [], + "source": [ + "def get_cur_convo_transcript(corpus, utt_lst):\n", + " # utt_list = [corpus.get_utterance(utt_id) for utt_id in utt_lst]\n", + " transcription = []\n", + " spk_list = {utt_lst[0].speaker.id : \"SPEAKER1\"}\n", + " for utt in utt_lst:\n", + " if utt.speaker.id not in spk_list.keys():\n", + " spk_list[utt.speaker.id] = \"SPEAKER2\"\n", + " assert len(spk_list) == 2\n", + " transcription.append(spk_list[utt.speaker.id] +\": \"+utt.text)\n", + " transcription = transcription[1:] ### truncate OP first message\n", + " return transcription" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9a4649c", + "metadata": {}, + "outputs": [], + "source": [ + "random.seed(4300)\n", + "need_to_compare_simiarity = {}\n", + "\n", + "for sp_id, selected_op_convos, selected_rp_convos in selected_pairs:\n", + " op1, op2 = selected_op_convos\n", + " rp1, rp2 = 
selected_rp_convos\n", + "\n", + " key = f\"{sp_id}\"\n", + " need_to_compare_simiarity[key] = {}\n", + "\n", + " need_to_compare_simiarity[key][\"op\"] = (op1, op2)\n", + " need_to_compare_simiarity[key][\"rp\"] = (rp1, rp2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f48099ae", + "metadata": {}, + "outputs": [], + "source": [ + "BASE_PATH = \"YOUR_PATH\"\n", + "\n", + "if not os.path.exists(f\"{BASE_PATH}\"):\n", + " print(\"Making directory: \", f\"{BASE_PATH}\")\n", + " os.makedirs(f\"{BASE_PATH}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "732ae869", + "metadata": {}, + "outputs": [], + "source": [ + "# ======= Load previously saved files =======\n", + "from convokit.convo_similarity.utils import format_transcript_from_convokit_utt_lst\n", + "\n", + "\n", + "if os.path.exists(f\"{BASE_PATH}summary.json\"):\n", + " with open(f\"{BASE_PATH}summary.json\", \"r\") as file:\n", + " summaries_and_bullets = json.load(file)\n", + "else:\n", + " summaries_and_bullets = {}\n", + "\n", + "if os.path.exists(f\"{BASE_PATH}similarity.json\"):\n", + " with open(f\"{BASE_PATH}similarity.json\", \"r\") as file:\n", + " similarity_result = json.load(file)\n", + "else:\n", + " similarity_result = {}\n", + "\n", + "if os.path.exists(f\"{BASE_PATH}incomplete.json\"):\n", + " with open(f\"{BASE_PATH}incomplete.json\", \"r\") as file:\n", + " incomplete = set(json.load(file))\n", + "else:\n", + " incomplete = set()\n", + "\n", + "# ======= Define thread worker function =======\n", + "def process_key(key, need_to_compares):\n", + " local_summaries = {}\n", + " local_similarity = {}\n", + " local_incomplete = set()\n", + "\n", + " for pair_id, (utt_lst1, utt_lst2) in need_to_compares.items():\n", + " if pair_id == \"op\":\n", + " id1 = f\"{key}###op1\"\n", + " id2 = f\"{key}###op2\"\n", + " elif pair_id == \"rp\":\n", + " id1 = f\"{key}###rp1\"\n", + " id2 = f\"{key}###rp2\"\n", + " else:\n", + " raise ValueError\n", + "\n", + " for idx, utt_lst in zip([id1, id2], [utt_lst1, utt_lst2]):\n", + " if idx in summaries_and_bullets:\n", + " continue\n", + " transcript = format_transcript_from_convokit_utt_lst(corpus, utt_lst)\n", + " # Check if we already have this data stored\n", + " if idx in summaries_and_bullets:\n", + " local_summaries[idx] = summaries_and_bullets[idx]\n", + " else:\n", + " # This case should not happen if SCDs are generated beforehand\n", + " # If it does, we need to generate SCD/SoP from the transcript\n", + " # But this should be done before the similarity computation phase\n", + " raise ValueError(f\"SCD/SoP not found for {idx}. 
Please generate SCDs beforehand.\")\n", + "\n", + " id_pair_key_1 = f'{id1}_{id2}'\n", + " id_pair_key_2 = f'{id2}_{id1}'\n", + "\n", + " if id_pair_key_1 not in similarity_result and id_pair_key_1 not in incomplete \\\n", + " and id_pair_key_2 not in similarity_result and id_pair_key_2 not in incomplete:\n", + " # Get the conversation IDs from utterance lists\n", + " convo_id1 = utt_lst1[0].get_conversation().id\n", + " convo_id2 = utt_lst2[0].get_conversation().id\n", + " \n", + " # Use the compare_conversations method\n", + " try:\n", + " result, score = condyns.compare_conversations(\n", + " corpus, convo_id1, convo_id2, \n", + " sop_meta_name=\"machine_sop\"\n", + " )\n", + " inc = set()\n", + " except Exception as e:\n", + " print(f\"Error comparing {convo_id1} and {convo_id2}: {e}\")\n", + " score, result, inc = None, None, {f'{id1}_{id2}'}\n", + " local_similarity[id_pair_key_1] = {\"score\": score, \"result\": result}\n", + " local_incomplete.update(inc)\n", + "\n", + " return local_summaries, local_similarity, local_incomplete\n", + "\n", + "# ======= Run multi-threaded processing =======\n", + "with ThreadPoolExecutor(max_workers=25) as executor:\n", + " futures = [\n", + " executor.submit(process_key, key, need_to_compares)\n", + " for key, need_to_compares in need_to_compare_simiarity.items()\n", + " ]\n", + "\n", + " for future in tqdm(as_completed(futures), total=len(futures)):\n", + " local_summaries, local_similarity, local_incomplete = future.result()\n", + " summaries_and_bullets.update(local_summaries)\n", + " similarity_result.update(local_similarity)\n", + " incomplete.update(local_incomplete)\n", + "\n", + "# ======= Save back to files =======\n", + "with open(f\"{BASE_PATH}summary.json\", \"w\") as file:\n", + " json.dump(summaries_and_bullets, file, indent=4)\n", + "\n", + "with open(f\"{BASE_PATH}similarity.json\", \"w\") as file:\n", + " json.dump(similarity_result, file, indent=4)\n", + "\n", + "with open(f\"{BASE_PATH}incomplete.json\", \"w\") as file:\n", + " json.dump(list(incomplete), file, indent=4)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0601e98d", + "metadata": {}, + "outputs": [], + "source": [ + "op_similarity = []\n", + "rp_similarity = []\n", + "\n", + "for key, need_to_compares in tqdm(need_to_compare_simiarity.items()):\n", + " for pair_id, _ in need_to_compares.items():\n", + " if pair_id == \"op\":\n", + " id1 = f\"{key}###op1\"\n", + " id2 = f\"{key}###op2\"\n", + " sim_key = f'{id1}_{id2}' if f'{id1}_{id2}' in similarity_result.keys() else f'{id2}_{id1}'\n", + " assert sim_key in similarity_result.keys()\n", + " op_similarity.append(np.mean(similarity_result[sim_key]['score']))\n", + " elif pair_id == \"rp\":\n", + " id1 = f\"{key}###rp1\"\n", + " id2 = f\"{key}###rp2\"\n", + " sim_key = f'{id1}_{id2}' if f'{id1}_{id2}' in similarity_result.keys() else f'{id2}_{id1}'\n", + " assert sim_key in similarity_result.keys()\n", + " rp_similarity.append(np.mean(similarity_result[sim_key]['score']))\n", + " else:\n", + " raise ValueError" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8577ad1d", + "metadata": {}, + "outputs": [], + "source": [ + "round(np.mean(op_similarity), 4), round(np.mean(rp_similarity), 4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8dc6f652", + "metadata": {}, + "outputs": [], + "source": [ + "t_stat, p_value = stats.ttest_ind(op_similarity, rp_similarity, equal_var=False)\n", + "print(f\"T-Test p-value {p_value:.4f}\")" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "521ad3ec", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import wilcoxon\n", + "\n", + "# Make sure they're lists of the same length\n", + "assert len(op_similarity) == len(rp_similarity)\n", + "\n", + "stat, p_value = wilcoxon(op_similarity, rp_similarity, alternative='two-sided')\n", + "\n", + "print(f\"Wilcoxon statistic: {stat}\")\n", + "print(f\"One-sided p-value: {p_value}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "balance", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/convokit/convo_similarity/examples/applications/friends/friends_condyns.ipynb b/convokit/convo_similarity/examples/applications/friends/friends_condyns.ipynb new file mode 100644 index 00000000..eb659113 --- /dev/null +++ b/convokit/convo_similarity/examples/applications/friends/friends_condyns.ipynb @@ -0,0 +1,585 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6f96a9a8", + "metadata": {}, + "source": [ + "# Application of ConDynS on Friends Dataset\n", + "\n", + "Dataset information can be found: https://convokit.cornell.edu/documentation/friends.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "026d45fb-18f6-42ed-ab8e-74ef07c19cca", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from convokit import Corpus, download\n", + "from tqdm import tqdm\n", + "import matplotlib.pyplot as plt\n", + "import ast\n", + "import nltk\n", + "nltk.download('punkt')\n", + "from nltk.tokenize import sent_tokenize\n", + "import numpy as np\n", + "import re\n", + "import scipy.stats as stats\n", + "from itertools import combinations\n", + "import random\n", + "import string\n", + "import math\n", + "from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n", + "from scipy.spatial.distance import squareform\n", + "from sklearn.feature_extraction.text import CountVectorizer as CV\n", + "import string\n", + "\n", + "from convokit.genai.genai_config import GenAIConfigManager\n", + "from convokit.convo_similarity.scd import SCD\n", + "from convokit.convo_similarity.condyns import ConDynS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d774f366-92a2-47ba-a450-a5b67f1a3c3c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading friends-corpus to /reef/sj597_kz88/scd-sim/wiki_exploration/friends-corpus\n", + "Downloading friends-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/friends-corpus/friends-corpus.zip (6.1MB)... Done\n" + ] + } + ], + "source": [ + "corpus = Corpus(filename=download(\"friends-corpus\", data_dir = \"YOUR DATA PATH\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "201581ef-6ccc-4b52-a412-b305f0ef5a27", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Monica Geller : Here you go. You can wear this.\n", + "Phoebe Buffay : Thanks!\n", + "Hold Voice : Please, stay on the line. Your call is important to us.\n", + "Chandler Bing : Hey! 
Can you take a duck and a chick to the theatre?\n" + ] + } + ], + "source": [ + "convo = corpus.random_conversation()\n", + "utt_lst = convo.get_utterance_ids()\n", + "speaker_ids = {}\n", + "transcript = \"\"\n", + "for utt_id in utt_lst:\n", + " utt = corpus.get_utterance(utt_id)\n", + " if \"TRANSCRIPT_NOTE\" not in utt.speaker.id:\n", + " if utt.speaker.id not in speaker_ids:\n", + " print(utt.speaker.id, \":\", utt.text)\n", + " speaker_ids[utt.speaker.id] = 1 + len(speaker_ids)\n", + " transcript += \"Speaker\"+str(speaker_ids[utt.speaker.id]) + \" : \" + utt.text+ \"\\n\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44744176", + "metadata": {}, + "outputs": [], + "source": [ + "### Setup path for data and corpus ###\n", + "\n", + "DATA_PATH = \"./data\"\n", + "filepath = DATA_PATH + \"PATH TO WIKI GERMAN DATA\"\n", + "\n", + "### Set up config for GenAI ###\n", + "config = GenAIConfigManager() ### make sure to set your own config if this is never set before\n", + "\n", + "### Select which model provider to use for ConDynS ###\n", + "MODEL_PROVIDER = \"gemini\"\n", + "MODEL = \"gemini-2.0-flash-001\"\n", + "config.set_google_cloud_config(\"YOUR PROJECT\", \"YOUR LOCATION\")\n", + "\n", + "with open(filepath, \"r\") as f:\n", + " dataset = json.load(f)\n", + "\n", + "random.seed(4300)\n", + "dataset = random.sample(dataset, 100)\n", + "len(dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a318dd33-d2e9-4fec-9e1d-79b97c4c5166", + "metadata": {}, + "outputs": [], + "source": [ + "### Prepare your own prompt for writing the SCD with your data\n", + "friends_summary_prompt = \"\"\"\n", + "Write a short summary capturing the trajectory of a casual conversation. \n", + "Do not include specific topics, events, or arguments from the conversation. The style you should avoid is illustrated in \n", + "Example Sentence 1: “Speaker1 said they had a difficult day at work, and mentioned that their boss was unfair. Speaker2 listened and agreed that bosses can be tough, then suggested they go out for dinner to forget about it..” Instead, you should include indicators of sentiments (e.g., warmth, empathy, humor, nostalgia, vulnerability, support), individual intentions (e.g., building rapport, offering reassurance, seeking validation, self-disclosure, active listening, gentle disagreement, creating distance), and conversational strategies (if any) such as “collaborative storytelling,” “inside jokes,” “mirroring emotions,” and “affectionate teasing.” \n", + "The following sentences demonstrate the style you should follow: \n", + "Example Sentence 2: “Both speakers have similar feelings and appeared mutually supportive. Speaker1 initiates with a moment of self-disclosure, and Speaker2 responds with empathy and validation. Both speakers build on this exchange, strengthening their rapport.” \n", + "Example Sentence 3: “The two speakers connected with back-and-forth affectionate teasing. Throughout the conversation, they kept building on each other's humor with playful remarks, creating a lighthearted and comfortable discussion.” Overall, the trajectory summary should capture the key moments where the emotional connection of the conversation notably changes. Here is an example of a complete trajectory summary: The conversation begins with two speakers exchanging neutral, surface-level comments. Speaker1 then shifts the tone by sharing a personal anecdote, prompting Speaker2 to respond with warmth and empathy. 
Speaker1 elaborates on their story and their need, but Speaker2 does not extend their support but retracts it. \n", + "Now, provide the trajectory summary for the following conversation. \n", + "Conversation Transcript: {formatted_object}. \n", + "Now, summarize this conversation. Remember, do not include specific topics, claims, or arguments from the conversation. Instead, try to capture the speakers' sentiments, intentions, and conversational/persuasive strategies. Limit the trajectory summary to 80 words. \n", + "Trajectory Summary:\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ba0364b", + "metadata": {}, + "outputs": [], + "source": [ + "friends_sop_prompt = \"\"\"\n", + "Here is a trajectory summary of a conversation that lays out how the dynamics of the conversation developed. You need to parse the summary into events in order. \n", + "Follow the following guidelines:\n", + "1. Try to maintain the original language of the summary as much as you can. \n", + "2. Provide your output as a Python dictionary with the following structure:\n", + "_(Note: Do NOT use markdown, JSON formatting, or code block delimiters.)_ \n", + "{{\n", + " '0': \"\" // description of the event\n", + " '1': ...\n", + " ...\n", + "}}\n", + "Here is the summary:\n", + "{formatted_object}\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c60c3627-f521-4dde-99b4-2be056c48945", + "metadata": {}, + "outputs": [], + "source": [ + "scd_transformer = SCD(model_provider=MODEL_PROVIDER, \n", + " model=MODEL, \n", + " config=config, \n", + " custom_scd_prompt=friends_summary_prompt, \n", + " custom_sop_prompt=friends_sop_prompt,\n", + " custom_prompt_dir=\"friends_prompts\")\n", + "condyns = ConDynS(model_provider=MODEL_PROVIDER, \n", + " model=MODEL, \n", + " config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "097b32c5-73a9-455e-bb97-1871259a3622", + "metadata": {}, + "outputs": [], + "source": [ + "def format_friends_transcript_from_convokit(corpus, convo_id):\n", + " convo = corpus.get_conversation(convo_id)\n", + " utt_lst = convo.get_utterance_ids()\n", + " speaker_ids = {}\n", + " transcript = \"\"\n", + " for utt_id in utt_lst:\n", + " utt = corpus.get_utterance(utt_id)\n", + " if \"TRANSCRIPT_NOTE\" not in utt.speaker.id:\n", + " if utt.speaker.id not in speaker_ids:\n", + " speaker_ids[utt.speaker.id] = 1 + len(speaker_ids)\n", + " transcript += \"Speaker\"+str(speaker_ids[utt.speaker.id]) + \" : \" + utt.text+ \"\\n\\n\"\n", + " return transcript\n", + "\n", + "def count_real_utterance_num(convo_id):\n", + " convo = corpus.get_conversation(convo_id)\n", + " utt_lst = convo.get_utterance_ids()\n", + " count = 0\n", + " for utt_id in utt_lst:\n", + " utt = corpus.get_utterance(utt_id)\n", + " if \"TRANSCRIPT_NOTE\" not in utt.speaker.id:\n", + " count += 1\n", + " return count" + ] + }, + { + "cell_type": "markdown", + "id": "de77132b-b86b-402f-902e-4d969a92b360", + "metadata": {}, + "source": [ + "# Generating Sop" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04052281-6acf-443d-a00a-682eed209cd2", + "metadata": {}, + "outputs": [], + "source": [ + "random.seed(4300)\n", + "convo_ids = []\n", + "while len(convo_ids) < 100:\n", + " convo_id = random.choice(corpus.get_conversation_ids())\n", + " if count_real_utterance_num(convo_id) >= 4:\n", + " convo_ids.append(convo_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9c24b32-8c21-4c24-8149-d6723e46ba86", + 
"metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Generating SCDs for conversations: 100%|██████| 100/100 [01:47<00:00, 1.07s/it]\n" + ] + } + ], + "source": [ + "# Create a custom formatter for Friends conversations\n", + "def format_friends_conversation(conversation):\n", + " return format_friends_transcript_from_convokit(corpus, conversation.id)\n", + "\n", + "# Set the custom formatter\n", + "scd_transformer.conversation_formatter = format_friends_conversation\n", + "\n", + "# Transform the corpus to generate SCDs and SoPs\n", + "def convo_selector(conversation):\n", + " return conversation.id in convo_ids\n", + "\n", + "scd_transformer.transform(corpus, selector=convo_selector)\n", + "\n", + "# Extract the results from conversation metadata\n", + "time_analysis_scd = {}\n", + "bulletpoints = {}\n", + "for convo_id in convo_ids:\n", + " convo = corpus.get_conversation(convo_id)\n", + " time_analysis_scd[convo_id] = convo.meta.get(\"machine_scd\", \"\")\n", + " bulletpoints[convo_id] = convo.meta.get(\"machine_sop\", \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43597d39-7f34-436f-a895-f0fc3a0d7dfd", + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_PATH + f\"friends_100_scd.json\", 'w') as file:\n", + " json.dump(time_analysis_scd, file, indent=4)\n", + "\n", + "with open(DATA_PATH + f\"friends_100_sop.json\", 'w') as file:\n", + " json.dump(bulletpoints, file, indent=4)" + ] + }, + { + "cell_type": "markdown", + "id": "cb477181-fc75-4664-916c-05230bc35b47", + "metadata": {}, + "source": [ + "# Calculate scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e3ae109-5ac0-446e-b7b2-17ae4a5556d4", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Calculating pairs similarity: 100%|█████████| 1225/1225 [11:05<00:00, 1.84it/s]\n" + ] + } + ], + "source": [ + "num = 50\n", + "all_combos = list(combinations(convo_ids[:num], 2))\n", + "convo_scores = {}\n", + "\n", + "# Set custom formatter for ConDynS\n", + "condyns_formatter = lambda conversation: format_friends_transcript_from_convokit(corpus, conversation.id)\n", + "\n", + "for convo_id1, convo_id2 in tqdm(all_combos, desc=\"Calculating pairs similarity\"):\n", + " if convo_id1 + \"_\" + convo_id2 in convo_scores or convo_id2 + \"_\" + convo_id1 in convo_scores:\n", + " continue\n", + " \n", + " # Use the new compare_conversations method\n", + " result, score = condyns.compare_conversations(\n", + " corpus, convo_id1, convo_id2, \n", + " sop_meta_name=\"machine_sop\",\n", + " formatter=condyns_formatter\n", + " )\n", + " \n", + " convo_scores[convo_id1 + \"_\" + convo_id2] = {\"result\": result, \"score\": score}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "202c6b08-5fb7-4cee-941a-a1576be95aa1", + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_PATH + f\"friends_50_scores.json\", 'w') as file:\n", + " json.dump(convo_scores, file, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "657f13ab-57fd-45e8-9001-97be28310f53", + "metadata": {}, + "outputs": [], + "source": [ + "def get_similarity(convo_id1, convo_id2):\n", + " if convo_id1 + \"_\" + convo_id2 in convo_scores:\n", + " return convo_scores[convo_id1 + \"_\" + convo_id2][\"score\"]\n", + " elif convo_id2 + \"_\" + convo_id1 in convo_scores:\n", + " return convo_scores[convo_id2 + \"_\" + convo_id1][\"score\"]\n", + " else:\n", + " 
print(\"Did not find the score\")\n", + " return" + ] + }, + { + "cell_type": "markdown", + "id": "813e0b78", + "metadata": {}, + "source": [ + "### Clustering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac0ca1ec-5f81-46dd-88ba-2a45d77909a5", + "metadata": {}, + "outputs": [], + "source": [ + "# Step 1: Create the distance matrix\n", + "n = len(convo_ids[:num])\n", + "distance_matrix = np.zeros((n, n))\n", + "\n", + "# Fill the distance matrix\n", + "for i in range(n):\n", + " for j in range(i + 1, n):\n", + " convo1, convo2 = convo_ids[i], convo_ids[j]\n", + " similarity = np.sum(get_similarity(convo1, convo2))\n", + " distance = 2 - similarity # Convert similarity to distance\n", + " distance_matrix[i, j] = distance_matrix[j, i] = distance # Symmetric matrix\n", + "\n", + "# Convert to condensed format for linkage function\n", + "condensed_dist_matrix = squareform(distance_matrix)\n", + "\n", + "# Step 2: Perform hierarchical clustering\n", + "linkage_matrix = linkage(condensed_dist_matrix, method=\"ward\") # Ward's method minimizes variance\n", + "\n", + "top_level_clusters = fcluster(linkage_matrix, t=2, criterion='maxclust')\n", + "\n", + "clusters = defaultdict(list)\n", + "for idx, label in enumerate(top_level_clusters):\n", + " clusters[label].append(idx)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f074175d", + "metadata": {}, + "outputs": [], + "source": [ + "# from https://github.com/jmhessel/FightingWords/blob/master/fighting_words_py3.py\n", + "exclude = set(string.punctuation)\n", + "\n", + "def basic_sanitize(in_string):\n", + " '''Returns a very roughly sanitized version of the input string.'''\n", + " in_string = ''.join([ch for ch in in_string if ch not in exclude])\n", + " in_string = in_string.lower()\n", + " in_string = ' '.join(in_string.split())\n", + " return in_string\n", + "\n", + "def bayes_compare_language(l1, l2, ngram = 1, prior=.01, cv = None):\n", + " '''\n", + " Arguments:\n", + " - l1, l2; a list of strings from each language sample\n", + " - ngram; an int describing up to what n gram you want to consider (1 is unigrams,\n", + " 2 is bigrams + unigrams, etc). Ignored if a custom CountVectorizer is passed.\n", + " - prior; either a float describing a uniform prior, or a vector describing a prior\n", + " over vocabulary items. 
If you're using a predefined vocabulary, make sure to specify that\n", + " when you make your CountVectorizer object.\n", + " - cv; a sklearn.feature_extraction.text.CountVectorizer object, if desired.\n", + "\n", + " Returns:\n", + " - A list of length |Vocab| where each entry is a (n-gram, zscore) tuple.'''\n", + " if cv is None and type(prior) is not float:\n", + " print(\"If using a non-uniform prior:\")\n", + " print(\"Please also pass a count vectorizer with the vocabulary parameter set.\")\n", + " quit()\n", + " l1 = [basic_sanitize(l) for l in l1]\n", + " l2 = [basic_sanitize(l) for l in l2]\n", + " if cv is None:\n", + " cv = CV(decode_error = 'ignore', min_df=2, max_df=0.9, ngram_range=(1,ngram),\n", + " binary = False,\n", + " max_features = 15000)\n", + " counts_mat = cv.fit_transform(l1+l2).toarray()\n", + " # Now sum over languages...\n", + " vocab_size = len(cv.vocabulary_)\n", + " print(\"Vocab size is {}\".format(vocab_size))\n", + " if type(prior) is float:\n", + " priors = np.array([prior for i in range(vocab_size)])\n", + " else:\n", + " priors = prior\n", + " z_scores = np.empty(priors.shape[0])\n", + " count_matrix = np.empty([2, vocab_size], dtype=np.float32)\n", + " count_matrix[0, :] = np.sum(counts_mat[:len(l1), :], axis = 0)\n", + " count_matrix[1, :] = np.sum(counts_mat[len(l1):, :], axis = 0)\n", + " a0 = np.sum(priors)\n", + " n1 = 1.*np.sum(count_matrix[0,:])\n", + " n2 = 1.*np.sum(count_matrix[1,:])\n", + " print(\"Comparing language...\")\n", + " for i in range(vocab_size):\n", + " #compute delta\n", + " term1 = np.log((count_matrix[0,i] + priors[i])/(n1 + a0 - count_matrix[0,i] - priors[i]))\n", + " term2 = np.log((count_matrix[1,i] + priors[i])/(n2 + a0 - count_matrix[1,i] - priors[i]))\n", + " delta = term1 - term2\n", + " #compute variance on delta\n", + " var = 1./(count_matrix[0,i] + priors[i]) + 1./(count_matrix[1,i] + priors[i])\n", + " #store final score\n", + " z_scores[i] = delta/np.sqrt(var)\n", + " index_to_term = {v:k for k,v in cv.vocabulary_.items()}\n", + " sorted_indices = np.argsort(z_scores)\n", + " return_list = []\n", + " for i in sorted_indices:\n", + " return_list.append((index_to_term[i], z_scores[i]))\n", + " return return_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53140ae7", + "metadata": {}, + "outputs": [], + "source": [ + "def get_fighting_words_matching_bullets(cluster1, cluster2, similarity_result=convo_scores, summaries_and_bullets=similarity_and_bulletpoints):\n", + " cluster1_combo = list(combinations(cluster1, 2))\n", + " matched_cluster1 = []\n", + " for convo_id1, convo_id2 in cluster1_combo:\n", + " key = f\"{convo_id1}_{convo_id2}\" if f\"{convo_id1}_{convo_id2}\" in similarity_result.keys() else f\"{convo_id2}_{convo_id1}\"\n", + " for k, result in enumerate(similarity_result[key][\"result\"]):\n", + " for index in result.keys():\n", + " if result[index]['score'] > 0.5:\n", + " if k == 0:\n", + " matched_cluster1.append(summaries_and_bullets['bulletpoints'][convo_id1][index])\n", + " else:\n", + " try:\n", + " matched_cluster1.append(summaries_and_bullets['bulletpoints'][convo_id2][index])\n", + " except Exception:\n", + " continue\n", + " \n", + " cluster2_combo = list(combinations(cluster2, 2))\n", + " matched_cluster2 = []\n", + " for convo_id1, convo_id2 in cluster2_combo:\n", + " key = f\"{convo_id1}_{convo_id2}\" if f\"{convo_id1}_{convo_id2}\" in similarity_result.keys() else f\"{convo_id2}_{convo_id1}\"\n", + " for k, result in 
enumerate(similarity_result[key][\"result\"]):\n",
+ " for index in result.keys():\n",
+ " if result[index]['score'] > 0.5:\n",
+ " if k == 0:\n",
+ " matched_cluster2.append(summaries_and_bullets['bulletpoints'][convo_id1][index])\n",
+ " else:\n",
+ " matched_cluster2.append(summaries_and_bullets['bulletpoints'][convo_id2][index])\n",
+ " \n",
+ " z_scores = bayes_compare_language(matched_cluster1, matched_cluster2, ngram = 3) \n",
+ " top_k = 15\n",
+ " top_k_class1 = list(reversed([(x[0], round(x[1],2)) for x in z_scores[-top_k:]]))\n",
+ " top_k_class2 = [(x[0], round(x[1],2)) for x in z_scores[:top_k]]\n",
+ " print(\"Fighting Words between Cluster1 and Cluster2:\")\n",
+ " print(\"Cluster1: \", top_k_class1)\n",
+ " print(\"Cluster2: \", top_k_class2)\n",
+ " return matched_cluster1, matched_cluster2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "61420259-1d1c-4b5e-9dce-ce416739a8d6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cluster1_ids = [convo_ids[i] for i in clusters[1]]\n",
+ "cluster2_ids = [convo_ids[i] for i in clusters[2]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5bc70072",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cluster1_bulletpoints, cluster2_bulletpoints = get_fighting_words_matching_bullets(cluster1_ids, cluster2_ids)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "64b8efe9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cluster1_bulletpoints"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "061bafb2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cluster2_bulletpoints"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "sj597-env",
+ "language": "python",
+ "name": "sj597-env"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/convokit/convo_similarity/examples/applications/wiki_german/wiki_german_condyns.ipynb b/convokit/convo_similarity/examples/applications/wiki_german/wiki_german_condyns.ipynb
new file mode 100644
index 00000000..804c8481
--- /dev/null
+++ b/convokit/convo_similarity/examples/applications/wiki_german/wiki_german_condyns.ipynb
@@ -0,0 +1,3060 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7a0787fb",
+ "metadata": {},
+ "source": [
+ "# Application of ConDynS on WikiConv German\n",
+ "\n",
+ "Dataset information can be found at: https://convokit.cornell.edu/documentation/wikiconv.html"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1f986bb9-5dae-4adc-a795-d195acc0e06d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package punkt to /home/kz88/nltk_data...\n",
+ "[nltk_data] Package punkt is already up-to-date!\n"
+ ]
+ }
+ ],
+ "source": [
+ "import json\n",
+ "from convokit import Corpus\n",
+ "from tqdm import tqdm\n",
+ "import matplotlib.pyplot as plt\n",
+ "import ast\n",
+ "import nltk\n",
+ "nltk.download('punkt')\n",
+ "from nltk.tokenize import sent_tokenize\n",
+ "import numpy as np\n",
+ "import re\n",
+ "import scipy.stats as stats\n",
+ "from itertools import 
combinations\n", + "import random\n", + "import string\n", + "import math\n", + "from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n", + "from scipy.spatial.distance import squareform\n", + "from sklearn.feature_extraction.text import CountVectorizer as CV\n", + "import string\n", + "\n", + "from convokit.genai.genai_config import GenAIConfigManager\n", + "from convokit.convo_similarity.scd import SCD\n", + "from convokit.convo_similarity.condyns import ConDynS" + ] + }, + { + "cell_type": "markdown", + "id": "eec494a0-dda0-496f-934f-c6ed99d7af81", + "metadata": {}, + "source": [ + "### Load WikiConvo German Conversations and Format It" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a700de8-f63f-4175-a321-ea3b2fc519b2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "100" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### Setup path for data and corpus ###\n", + "\n", + "DATA_PATH = \"./data\"\n", + "filepath = DATA_PATH + \"PATH TO WIKI GERMAN DATA\"\n", + "\n", + "### Set up config for GenAI ###\n", + "config = GenAIConfigManager() ### make sure to set your own config if this is never set before\n", + "\n", + "### Select which model provider to use for ConDynS ###\n", + "MODEL_PROVIDER = \"gemini\"\n", + "MODEL = \"gemini-2.0-flash-001\"\n", + "config.set_google_cloud_config(\"YOUR PROJECT\", \"YOUR LOCATION\")\n", + "\n", + "with open(filepath, \"r\") as f:\n", + " dataset = json.load(f)\n", + "\n", + "random.seed(4300)\n", + "dataset = random.sample(dataset, 100)\n", + "len(dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6d0f86ae-7cf0-4700-9ef1-4ce5a928095f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'convo_id': '64211111.132.132',\n", + " 'utterances': [{'isUnchanged': False,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': None,\n", + " 'user_text': 'Pechristener',\n", + " 'timestamp': '2009-09-06T17:49:41Z',\n", + " 'content': '== Schlitten-Standseilbahn => Schlittenseilbahn ==\\n',\n", + " 'cleaned_content': ' Schlitten-Standseilbahn => Schlittenseilbahn ',\n", + " 'replyTo_id': None,\n", + " 'page_id': '2160865',\n", + " 'indentation': -1,\n", + " 'authors': ['5088:Pechristener'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '5088',\n", + " 'type': 'CREATION',\n", + " 'id': '64211111.132.132',\n", + " 'ancestor_id': '64211111.132.132',\n", + " 'rev_id': 64211111},\n", + " {'isUnchanged': False,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': None,\n", + " 'user_text': 'Pechristener',\n", + " 'timestamp': '2009-09-06T17:49:41Z',\n", + " 'content': \"Ich habe mir erlaubt den Artikel nach ''Schlittenseilbahn'' zu verschieben. Ich bin einverstanden, dass [[Funi (Seilbahn)]] kein schlaues Lemma für einen Artikel ist, ''Schlitten-Standseilbahn'' aber auch nicht wirklich, aus den folgenden Gründen:\\n* Der Begriff taucht nur in Wikipedia-Artikeln auf, aber kaum in der freien Internetwildbahn.\\n* Ein Schlitten kann schlecht hängen.\\n* Nach alter deutscher Rechtschreibung müsste das Lemma eher zusammengeschrieben werden als ''Schlittenstandseilbahn''\\n* Der Hersteller der meisten (aller?) 
Anlagen ist die [[Bell Maschinenfabrik]], wo die Anlagen als ''Schlittenseilbahn'' bezeichnet wurden.\\nIch habe mir deshalb folgendes vorgehen erlaubt:\\n* ''Schlitten-Standseilbahn'' nach ''Schlittenseilbahn'' verschoben\\n* [[Funi (Seilbahn)]] wieder erstellt und als Weiterleitungsseite auf [[Standseilbahn]] versehen. ''Funi'' ist ein weit verbreiteter Helvetismus, der allgemein auf Standseilbahnen verwendet wird. -- [[Benutzer:Pechristener|Pechristener]] 19:49, 6. Sep. 2009 (CEST)\\n\",\n", + " 'cleaned_content': \"Ich habe mir erlaubt den Artikel nach ''Schlittenseilbahn'' zu verschieben. Ich bin einverstanden, dass Funi (Seilbahn) kein schlaues Lemma für einen Artikel ist, ''Schlitten-Standseilbahn'' aber auch nicht wirklich, aus den folgenden Gründen:\\n Der Begriff taucht nur in Wikipedia-Artikeln auf, aber kaum in der freien Internetwildbahn.\\n Ein Schlitten kann schlecht hängen.\\n Nach alter deutscher Rechtschreibung müsste das Lemma eher zusammengeschrieben werden als ''Schlittenstandseilbahn''\\n Der Hersteller der meisten (aller?) Anlagen ist die Bell Maschinenfabrik, wo die Anlagen als ''Schlittenseilbahn'' bezeichnet wurden.\\nIch habe mir deshalb folgendes vorgehen erlaubt:\\n ''Schlitten-Standseilbahn'' nach ''Schlittenseilbahn'' verschoben\\n Funi (Seilbahn) wieder erstellt und als Weiterleitungsseite auf Standseilbahn versehen. ''Funi'' ist ein weit verbreiteter Helvetismus, der allgemein auf Standseilbahnen verwendet wird. Pechristener 19:49, 6. Sep. 2009 (CEST)\",\n", + " 'replyTo_id': '64211111.132.132',\n", + " 'page_id': '2160865',\n", + " 'indentation': 0,\n", + " 'authors': ['5088:Pechristener'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '5088',\n", + " 'type': 'ADDITION',\n", + " 'id': '64211111.183.132',\n", + " 'ancestor_id': '64211111.183.132',\n", + " 'rev_id': 64211111},\n", + " {'isUnchanged': False,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': None,\n", + " 'user_text': 'Albinfo',\n", + " 'timestamp': '2009-09-06T21:44:12Z',\n", + " 'content': \": Meines Erachtens ist nach wie vor ''Funi'' – mit welchem leider notwendigen Zusatz auch immer – nach wie vor das einzige zulässige Lemma. Ursprünglich wurde es unter diesem Lemma eingestellt. Das Wort Funi wurde dann aus dem Lemma entfernt mit der Begründung '' allgemeinverständlicheres Lemma''. Ein Lemma sollte aber nicht allgemeinverständlich und schon gar keine Erklärung des Begriffs sein, sondern den Begriff wiedergeben. Dass ''Funi'' derjenige Begriff ist, der Eingang in den allgemeinen Sprachgebrauch gefunden hat, zeigt sich in den angegbenen Quellen, wo ausschliesslich dieser verwendet wird, und auch im Text des Artikels selbst, wo trotz des geänderten Lemmas noch immer ''Funi'' verwendet wird (ohne dass sich jemand daran gestört hätte).\\n: Insofern bin ich auch nicht ganz einverstanden mit der Weiterleitung von [[Funi (Seilbahn)]] auf [[Standseilbahn]]. Es stimmt zwar, dass in gewissen Gegenden der Schweiz ''Funi'' als Bezeichnung einer lokalen Standseilbahn verwendet wird. Vielleicht bezeichnen einzelne Freiburger oder Berner im Dialekt damit sogar jede Art von Standseilbahn. Aber in beiden Fällen erfüllt der Begriff nicht das Erfordernis eines [[Helvetismus]] (''sprachliche Besonderheit, die typischerweise im Schweizer Hochdeutschen und nicht im gesamten deutschen Sprachgebiet verwendet wird'') – es ist undenkbar, dass zum Beispiel eine Zeitung den Begriff ''Funi'' als helvetische Alternative für ''Standseilbahn'' verwendet. 
Der Begriff ''Funi'' wurde aber schweizweit für „Schlittenseilbahnen“ verwendet und könnte durchaus so auch in einer Zeitung zu lesen sein. --[[Benutzer:Albinfo|Lars]] 23:44, 6. Sep. 2009 (CEST)\\n\",\n", + " 'cleaned_content': \" Meines Erachtens ist nach wie vor ''Funi'' – mit welchem leider notwendigen Zusatz auch immer – nach wie vor das einzige zulässige Lemma. Ursprünglich wurde es unter diesem Lemma eingestellt. Das Wort Funi wurde dann aus dem Lemma entfernt mit der Begründung '' allgemeinverständlicheres Lemma''. Ein Lemma sollte aber nicht allgemeinverständlich und schon gar keine Erklärung des Begriffs sein, sondern den Begriff wiedergeben. Dass ''Funi'' derjenige Begriff ist, der Eingang in den allgemeinen Sprachgebrauch gefunden hat, zeigt sich in den angegbenen Quellen, wo ausschliesslich dieser verwendet wird, und auch im Text des Artikels selbst, wo trotz des geänderten Lemmas noch immer ''Funi'' verwendet wird (ohne dass sich jemand daran gestört hätte).\\n Insofern bin ich auch nicht ganz einverstanden mit der Weiterleitung von Funi (Seilbahn) auf Standseilbahn. Es stimmt zwar, dass in gewissen Gegenden der Schweiz ''Funi'' als Bezeichnung einer lokalen Standseilbahn verwendet wird. Vielleicht bezeichnen einzelne Freiburger oder Berner im Dialekt damit sogar jede Art von Standseilbahn. Aber in beiden Fällen erfüllt der Begriff nicht das Erfordernis eines Helvetismus (''sprachliche Besonderheit, die typischerweise im Schweizer Hochdeutschen und nicht im gesamten deutschen Sprachgebiet verwendet wird'') – es ist undenkbar, dass zum Beispiel eine Zeitung den Begriff ''Funi'' als helvetische Alternative für ''Standseilbahn'' verwendet. Der Begriff ''Funi'' wurde aber schweizweit für „Schlittenseilbahnen“ verwendet und könnte durchaus so auch in einer Zeitung zu lesen sein. Lars 23:44, 6. Sep. 2009 (CEST)\",\n", + " 'replyTo_id': '64211111.183.132',\n", + " 'page_id': '2160865',\n", + " 'indentation': 1,\n", + " 'authors': ['2621:Albinfo'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '2621',\n", + " 'type': 'ADDITION',\n", + " 'id': '64220277.1204.1204',\n", + " 'ancestor_id': '64220277.1204.1204',\n", + " 'rev_id': 64220277},\n", + " {'isUnchanged': False,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': None,\n", + " 'user_text': 'Pechristener',\n", + " 'timestamp': '2009-09-07T20:09:06Z',\n", + " 'content': \"::Ich habe nichts gegen das Lemma ''Funi'', aber aus meiner Sicht ist es nicht eindeutig eine Schlittenseilbahn, sonder kann irgendeine Standseilbahn sein. Ich nehme aber deine Anregung auf und habe aus der Weiterleitung eine Begriffserklärung gemacht. Siehe [[Funi (Seilbahn)|hier]].\\n::''Schlittenseilbahn'' ist aus meiner Sicht immer noch das richtige Lemma, denn Wikipedia findet für diesen Begriff ca. 900 Treffer. Für ''Funi'' befinden sich schon auf der ersten Seite auch Treffer, die sich auf ''Standseilbahn'' beziehen.\\n::Aus meiner Sicht erfüllt ''Funi'' das Kriterium für einen [[Helvetismus]] auf jeden Fall: Das 1. Kriterium weil es eben nur in der Schweiz gleichwertig für Standseilbahn verwendet wird, das 2. Kriterium auch weil es eine aus der Schweiz stammende Bezeichnung für eine Schlittenseilbahn ist.\\n::''Funi'' in Zeitungen kann sehr wohl vorkommen, wenn auch nicht wirklich häufig. 
Selbst die [[Neue Zürcher Zeitung]] kennt den Begriff:\\n:::''Wer mit dem Zug ankommt, nimmt das nächste Funi, die allzeit bereite Drahtseilbahn.'' Quelle: [http://www.nzz.ch/2004/03/25/to/article9hcp0_1.232755.html Lugano lebt auch hinter den Gleisen]\\n::Auch andere Texte benutzen ''Funi'' im Sinne von Standseilbahn, nicht nur für die Bahn in Freiburg. Beispiele sind auf diesen Seiten zu finden: [http://homepage.hispeed.ch/eric.sch/Actualites-d_08.htm 1], [http://www.austria-verein-freiburg-ch.com/50.html 2] und [http://www.misterx.ch/specials-gigathlon_07.htm 3].\\n::-- [[Benutzer:Pechristener|Pechristener]] 22:09, 7. Sep. 2009 (CEST)\\n\",\n", + " 'cleaned_content': \"Ich habe nichts gegen das Lemma ''Funi'', aber aus meiner Sicht ist es nicht eindeutig eine Schlittenseilbahn, sonder kann irgendeine Standseilbahn sein. Ich nehme aber deine Anregung auf und habe aus der Weiterleitung eine Begriffserklärung gemacht. Siehe hier.\\n''Schlittenseilbahn'' ist aus meiner Sicht immer noch das richtige Lemma, denn Wikipedia findet für diesen Begriff ca. 900 Treffer. Für ''Funi'' befinden sich schon auf der ersten Seite auch Treffer, die sich auf ''Standseilbahn'' beziehen.\\nAus meiner Sicht erfüllt ''Funi'' das Kriterium für einen Helvetismus auf jeden Fall: Das 1. Kriterium weil es eben nur in der Schweiz gleichwertig für Standseilbahn verwendet wird, das 2. Kriterium auch weil es eine aus der Schweiz stammende Bezeichnung für eine Schlittenseilbahn ist.\\n''Funi'' in Zeitungen kann sehr wohl vorkommen, wenn auch nicht wirklich häufig. Selbst die Neue Zürcher Zeitung kennt den Begriff:\\n''Wer mit dem Zug ankommt, nimmt das nächste Funi, die allzeit bereite Drahtseilbahn.'' Quelle: Lugano lebt auch hinter den Gleisen\\nAuch andere Texte benutzen ''Funi'' im Sinne von Standseilbahn, nicht nur für die Bahn in Freiburg. Beispiele sind auf diesen Seiten zu finden: 1, 2 und 3.\\n Pechristener 22:09, 7. Sep. 2009 (CEST)\",\n", + " 'replyTo_id': '64220277.1204.1204',\n", + " 'page_id': '2160865',\n", + " 'indentation': 2,\n", + " 'authors': ['5088:Pechristener'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '5088',\n", + " 'type': 'ADDITION',\n", + " 'id': '64254817.2866.2866',\n", + " 'ancestor_id': '64254817.2866.2866',\n", + " 'rev_id': 64254817},\n", + " {'isUnchanged': True,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': '64254817.2866.2866',\n", + " 'user_text': 'Pechristener',\n", + " 'timestamp': '2009-09-07T20:10:27Z',\n", + " 'content': \"::Ich habe nichts gegen das Lemma ''Funi'', aber aus meiner Sicht ist es nicht eindeutig eine Schlittenseilbahn, sonder kann irgendeine Standseilbahn sein. Ich nehme aber deine Anregung auf und habe aus der Weiterleitung eine Begriffserklärung gemacht. Siehe [[Funi (Seilbahn)|hier]].\\n::''Schlittenseilbahn'' ist aus meiner Sicht immer noch das richtige Lemma, denn Wikipedia findet für diesen Begriff ca. 900 Treffer. Für ''Funi'' befinden sich schon auf der ersten Seite auch Treffer, die sich auf ''Standseilbahn'' beziehen.\\n::Aus meiner Sicht erfüllt ''Funi'' das Kriterium für einen [[Helvetismus]] auf jeden Fall: Das 1. Kriterium weil es eben nur in der Schweiz gleichwertig für Standseilbahn verwendet wird, das 2. Kriterium auch weil es eine aus der Schweiz stammende Bezeichnung für eine Schlittenseilbahn ist. Das heisst aber für mich nicht, dass er nicht in der Wikipedia auftauchen darf.\\n::''Funi'' in Zeitungen kann auch vorkommen, wenn auch nicht wirklich häufig. 
Selbst die [[Neue Zürcher Zeitung]] kennt den Begriff:\\n:::''Wer mit dem Zug ankommt, nimmt das nächste Funi, die allzeit bereite Drahtseilbahn.'' Quelle: [http://www.nzz.ch/2004/03/25/to/article9hcp0_1.232755.html Lugano lebt auch hinter den Gleisen]\\n::Auch andere Texte benutzen ''Funi'' im Sinne von Standseilbahn, nicht nur für die Bahn in Freiburg. Beispiele sind auf diesen Seiten zu finden: [http://homepage.hispeed.ch/eric.sch/Actualites-d_08.htm 1], [http://www.austria-verein-freiburg-ch.com/50.html 2] und [http://www.misterx.ch/specials-gigathlon_07.htm 3].\\n::-- [[Benutzer:Pechristener|Pechristener]] 22:09, 7. Sep. 2009 (CEST)\\n\",\n", + " 'cleaned_content': \"Ich habe nichts gegen das Lemma ''Funi'', aber aus meiner Sicht ist es nicht eindeutig eine Schlittenseilbahn, sonder kann irgendeine Standseilbahn sein. Ich nehme aber deine Anregung auf und habe aus der Weiterleitung eine Begriffserklärung gemacht. Siehe hier.\\n''Schlittenseilbahn'' ist aus meiner Sicht immer noch das richtige Lemma, denn Wikipedia findet für diesen Begriff ca. 900 Treffer. Für ''Funi'' befinden sich schon auf der ersten Seite auch Treffer, die sich auf ''Standseilbahn'' beziehen.\\nAus meiner Sicht erfüllt ''Funi'' das Kriterium für einen Helvetismus auf jeden Fall: Das 1. Kriterium weil es eben nur in der Schweiz gleichwertig für Standseilbahn verwendet wird, das 2. Kriterium auch weil es eine aus der Schweiz stammende Bezeichnung für eine Schlittenseilbahn ist. Das heisst aber für mich nicht, dass er nicht in der Wikipedia auftauchen darf.\\n''Funi'' in Zeitungen kann auch vorkommen, wenn auch nicht wirklich häufig. Selbst die Neue Zürcher Zeitung kennt den Begriff:\\n''Wer mit dem Zug ankommt, nimmt das nächste Funi, die allzeit bereite Drahtseilbahn.'' Quelle: Lugano lebt auch hinter den Gleisen\\nAuch andere Texte benutzen ''Funi'' im Sinne von Standseilbahn, nicht nur für die Bahn in Freiburg. Beispiele sind auf diesen Seiten zu finden: 1, 2 und 3.\\n Pechristener 22:09, 7. Sep. 2009 (CEST)\",\n", + " 'replyTo_id': '64220277.1204.1204',\n", + " 'page_id': '2160865',\n", + " 'indentation': 2,\n", + " 'authors': ['5088:Pechristener'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '5088',\n", + " 'type': 'MODIFICATION',\n", + " 'id': '64254883.2866.2866',\n", + " 'ancestor_id': '64254817.2866.2866',\n", + " 'rev_id': 64254883},\n", + " {'isUnchanged': False,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': None,\n", + " 'user_text': '84.73.134.144',\n", + " 'timestamp': '2009-09-08T05:35:58Z',\n", + " 'content': \"Funi wird im Schweizer HOCHDEUTSCHEN nicht allgemein für Standseilbahn verwendet, sondern bloss für bestimmte Bahnen in der West- und Südschweiz, deren französischer bzw. italienischer Name mit ''funi…'' beginnt und die im Volksmund als «Funi» bezeichnet werden. Im Dialekt mögen manche Leute zum Spass manche Standseilbahn als Funi bezeichnen. Schweizer Hochdeutsch ist das nicht. --[[Spezial:Beiträge/84.73.134.144|84.73.134.144]] 07:35, 8. Sep. 2009 (CEST)\\n\",\n", + " 'cleaned_content': \"Funi wird im Schweizer HOCHDEUTSCHEN nicht allgemein für Standseilbahn verwendet, sondern bloss für bestimmte Bahnen in der West- und Südschweiz, deren französischer bzw. italienischer Name mit ''funi…'' beginnt und die im Volksmund als «Funi» bezeichnet werden. Im Dialekt mögen manche Leute zum Spass manche Standseilbahn als Funi bezeichnen. Schweizer Hochdeutsch ist das nicht. 84.73.134.144 07:35, 8. Sep. 
2009 (CEST)\",\n", + " 'replyTo_id': '64211111.132.132',\n", + " 'page_id': '2160865',\n", + " 'indentation': 0,\n", + " 'authors': ['ANONYMOUS:84.73.134.144'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': None,\n", + " 'type': 'ADDITION',\n", + " 'id': '64263837.4485.4485',\n", + " 'ancestor_id': '64263837.4485.4485',\n", + " 'rev_id': 64263837},\n", + " {'isUnchanged': False,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': None,\n", + " 'user_text': 'Albinfo',\n", + " 'timestamp': '2009-09-08T07:47:16Z',\n", + " 'content': ': Schliesse mich meine Vorredner an: Alle Textbelege, die du erwähnt hast, sprechen von \"dem Funi\" oder \"das Funi\". Es sind also ganz spezifische Bahnen gemeint. Aber das ist kein Schweizer Synonym für \\'\\'irgendeine beliebige Standseilbahn\\'\\' resp. \\'\\'Standseilbahnen im Allgemeinen\\'\\'. Alle Suchtreffer für \"ein Funi\" nehmen Bezug auf Schlittenseilbahnen. --[[Benutzer:Albinfo|Lars]] 09:47, 8. Sep. 2009 (CEST)\\n',\n", + " 'cleaned_content': ' Schliesse mich meine Vorredner an: Alle Textbelege, die du erwähnt hast, sprechen von \"dem Funi\" oder \"das Funi\". Es sind also ganz spezifische Bahnen gemeint. Aber das ist kein Schweizer Synonym für \\'\\'irgendeine beliebige Standseilbahn\\'\\' resp. \\'\\'Standseilbahnen im Allgemeinen\\'\\'. Alle Suchtreffer für \"ein Funi\" nehmen Bezug auf Schlittenseilbahnen. Lars 09:47, 8. Sep. 2009 (CEST)',\n", + " 'replyTo_id': '64263837.4485.4485',\n", + " 'page_id': '2160865',\n", + " 'indentation': 1,\n", + " 'authors': ['2621:Albinfo'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '2621',\n", + " 'type': 'ADDITION',\n", + " 'id': '64266509.4945.4945',\n", + " 'ancestor_id': '64266509.4945.4945',\n", + " 'rev_id': 64266509},\n", + " {'isUnchanged': False,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': None,\n", + " 'user_text': 'Pechristener',\n", + " 'timestamp': '2009-09-08T20:08:03Z',\n", + " 'content': '::Ich kann mich den beiden Vorredner immer noch nicht anschliessen. \\'\\'Funi\\'\\' ist sicher ein [[Volksmund]]ausdruck, aber das schliesst nicht aus, dass es [[Hochdeutsch]] ist, denn der Ausdruck kommt auch als \\'\\'Standseilbahn\\'\\' dialetkunabhängig in den Medien zur Anwendung, einzig wie breit die Anwendung ist, kann noch diskutiert werden. Wenn der Volksmund keine Berechtigung in Wikipedia hätte, dann dürfte auch \\'\\'Funi\\'\\' für \\'\\'Schlittenseilbahn\\'\\' nicht als Lemma verwendet werden, denn in der von bereits von [[Benutzer:Albinfo|Lars]] zitierten [http://www.seilbahn-nostalgie.ch/ Seilbahn-Nostalgie] steht unter [http://www.seilbahn-nostalgie.ch/skilifte.html Schlittenseilbahnen und frühe Skilifte in der Schweiz] geschrieben: \\'\\'... Schlittenseilbahnen, im Volksmund auch Funis genannt (\"Funi\" ist die Abkürzung für das französische \"Funiculaire\" = Standseilbahn)\\'\\' oder auch bei [http://jwalker.ch/funi/home-d.htm Funi Wildhaus] \\'\\'Funi (Kurzform von «Funiculaire»)\\'\\'.\\n::Bahnen mit \\'\\'Funi..\\'\\' im Namen werden eher mit \\'\\'Funi\\'\\' bezeichnet, aber es gibt auch andere Beispiele. Eines befindet sich sogar auf der ersten Google Seite für die Suche nach [http://www.google.ch/search?rlz=1C1CHMB_deCH307CH307&sourceid=chrome&ie=UTF-8&q=\"ein+funi\"] und zwar dieses [http://www.travelblog.org/South-America/Chile/Valparaiso-Region/Valparaiso/blog-103731.html hier] für eine Standseilbahn in Chile, die übrigens noch in der [[Liste der Standseilbahnen]] fehlt. 
Auch \\'\\'Funiculair\\'\\' wird als allgemeiner Begriff verwendet, wie z.B. [http://meinkiew.blogspot.com/2006/09/kiewer-funiculaire.html hier]-- [[Benutzer:Pechristener|Pechristener]] 22:08, 8. Sep. 2009 (CEST)\\n',\n", + " 'cleaned_content': 'Ich kann mich den beiden Vorredner immer noch nicht anschliessen. \\'\\'Funi\\'\\' ist sicher ein Volksmundausdruck, aber das schliesst nicht aus, dass es Hochdeutsch ist, denn der Ausdruck kommt auch als \\'\\'Standseilbahn\\'\\' dialetkunabhängig in den Medien zur Anwendung, einzig wie breit die Anwendung ist, kann noch diskutiert werden. Wenn der Volksmund keine Berechtigung in Wikipedia hätte, dann dürfte auch \\'\\'Funi\\'\\' für \\'\\'Schlittenseilbahn\\'\\' nicht als Lemma verwendet werden, denn in der von bereits von Lars zitierten Seilbahn-Nostalgie steht unter Schlittenseilbahnen und frühe Skilifte in der Schweiz geschrieben: \\'\\'... Schlittenseilbahnen, im Volksmund auch Funis genannt (\"Funi\" ist die Abkürzung für das französische \"Funiculaire\" = Standseilbahn)\\'\\' oder auch bei Funi Wildhaus \\'\\'Funi (Kurzform von «Funiculaire»)\\'\\'.\\nBahnen mit \\'\\'Funi..\\'\\' im Namen werden eher mit \\'\\'Funi\\'\\' bezeichnet, aber es gibt auch andere Beispiele. Eines befindet sich sogar auf der ersten Google Seite für die Suche nach und zwar dieses hier für eine Standseilbahn in Chile, die übrigens noch in der Liste der Standseilbahnen fehlt. Auch \\'\\'Funiculair\\'\\' wird als allgemeiner Begriff verwendet, wie z.B. hier Pechristener 22:08, 8. Sep. 2009 (CEST)',\n", + " 'replyTo_id': '64266509.4945.4945',\n", + " 'page_id': '2160865',\n", + " 'indentation': 2,\n", + " 'authors': ['5088:Pechristener'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '5088',\n", + " 'type': 'ADDITION',\n", + " 'id': '64304109.5353.5353',\n", + " 'ancestor_id': '64304109.5353.5353',\n", + " 'rev_id': 64304109},\n", + " {'isUnchanged': True,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': '64304109.5353.5353',\n", + " 'user_text': 'Pechristener',\n", + " 'timestamp': '2009-09-08T20:10:00Z',\n", + " 'content': '::Ich kann mich den beiden Vorredner immer noch nicht anschliessen. \\'\\'Funi\\'\\' ist sicher ein [[Volksmund]]ausdruck, aber das schliesst nicht aus, dass es [[Hochdeutsch]] ist, denn der Ausdruck kommt auch als \\'\\'Standseilbahn\\'\\' dialetkunabhängig in den Medien zur Anwendung, einzig wie breit die Anwendung ist, kann noch diskutiert werden. Wenn der Volksmund keine Berechtigung in Wikipedia hätte, dann dürfte auch \\'\\'Funi\\'\\' für \\'\\'Schlittenseilbahn\\'\\' nicht als Lemma verwendet werden, denn in der bereits von [[Benutzer:Albinfo|Lars]] zitierten [http://www.seilbahn-nostalgie.ch/ Seilbahn-Nostalgie] steht unter [http://www.seilbahn-nostalgie.ch/skilifte.html Schlittenseilbahnen und frühe Skilifte in der Schweiz] geschrieben: \\'\\'... Schlittenseilbahnen, im Volksmund auch Funis genannt (\"Funi\" ist die Abkürzung für das französische \"Funiculaire\" = Standseilbahn)\\'\\' oder auch bei [http://jwalker.ch/funi/home-d.htm Funi Wildhaus] \\'\\'Funi (Kurzform von «Funiculaire»)\\'\\'.\\n::Bahnen mit \\'\\'Funi..\\'\\' im Namen werden eher mit \\'\\'Funi\\'\\' bezeichnet, aber es gibt auch andere Beispiele. 
Eines befindet sich sogar auf der ersten Google Seite für die Suche nach [http://www.google.ch/search?rlz=1C1CHMB_deCH307CH307&sourceid=chrome&ie=UTF-8&q=\"ein+funi\"] und zwar dieses [http://www.travelblog.org/South-America/Chile/Valparaiso-Region/Valparaiso/blog-103731.html hier] für eine Standseilbahn in Chile, die übrigens noch in der [[Liste der Standseilbahnen]] fehlt. Auch \\'\\'Funiculair\\'\\' wird als allgemeiner Begriff verwendet, wie z.B. [http://meinkiew.blogspot.com/2006/09/kiewer-funiculaire.html hier]-- [[Benutzer:Pechristener|Pechristener]] 22:08, 8. Sep. 2009 (CEST)\\n',\n", + " 'cleaned_content': 'Ich kann mich den beiden Vorredner immer noch nicht anschliessen. \\'\\'Funi\\'\\' ist sicher ein Volksmundausdruck, aber das schliesst nicht aus, dass es Hochdeutsch ist, denn der Ausdruck kommt auch als \\'\\'Standseilbahn\\'\\' dialetkunabhängig in den Medien zur Anwendung, einzig wie breit die Anwendung ist, kann noch diskutiert werden. Wenn der Volksmund keine Berechtigung in Wikipedia hätte, dann dürfte auch \\'\\'Funi\\'\\' für \\'\\'Schlittenseilbahn\\'\\' nicht als Lemma verwendet werden, denn in der bereits von Lars zitierten Seilbahn-Nostalgie steht unter Schlittenseilbahnen und frühe Skilifte in der Schweiz geschrieben: \\'\\'... Schlittenseilbahnen, im Volksmund auch Funis genannt (\"Funi\" ist die Abkürzung für das französische \"Funiculaire\" = Standseilbahn)\\'\\' oder auch bei Funi Wildhaus \\'\\'Funi (Kurzform von «Funiculaire»)\\'\\'.\\nBahnen mit \\'\\'Funi..\\'\\' im Namen werden eher mit \\'\\'Funi\\'\\' bezeichnet, aber es gibt auch andere Beispiele. Eines befindet sich sogar auf der ersten Google Seite für die Suche nach und zwar dieses hier für eine Standseilbahn in Chile, die übrigens noch in der Liste der Standseilbahnen fehlt. Auch \\'\\'Funiculair\\'\\' wird als allgemeiner Begriff verwendet, wie z.B. hier Pechristener 22:08, 8. Sep. 2009 (CEST)',\n", + " 'replyTo_id': '64266509.4945.4945',\n", + " 'page_id': '2160865',\n", + " 'indentation': 2,\n", + " 'authors': ['5088:Pechristener'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '5088',\n", + " 'type': 'MODIFICATION',\n", + " 'id': '64304181.5353.5353',\n", + " 'ancestor_id': '64304109.5353.5353',\n", + " 'rev_id': 64304181},\n", + " {'isUnchanged': False,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': None,\n", + " 'user_text': 'Albinfo',\n", + " 'timestamp': '2009-09-09T09:24:46Z',\n", + " 'content': ': Wir müssen unterscheiden zwischen \\'\\'Volksmund\\'\\' (im Sinne von Helvetismus, der ins Hochdeutsche Eingang gefunden hat) und Schweizerdeutsch. Dass zwei Schweizer in ihren hochdeutschen Texten den Begriff verwenden, macht noch nicht zwingend einen Helvetismus. Und dass jemand \"im Volksmus auch Funis genannt\" schreibt, macht das Wort noch nicht zum schweizerdeutschen Begriff. Ich bin nach wie vor der Ansicht, dass \\'\\'Funi\\'\\' die gebräuchlichste Bezeichnung für \\'\\'Schlittenseilbahnen\\'\\' ist (eine Volksmundbezeichnung lässt sich durch die sehr regioinale Verwendung dieser Geräte gut rechtfertigen), und dass \\'\\'Funi\\'\\' kein Helvetismus ist, weil im allgemeinen Hochdeutsch-Gebrauch von Schweizern (trotz zwei oder drei Ausnahmen) dieses Wort nicht den Standard-Ausdruck Standseilbahn ersetzt hat (und im Gegensatz zu vielen Helvetismen auch keinen Eingang in den Duden gefunden hat). --[[Benutzer:Albinfo|Lars]] 11:24, 9. Sep. 
2009 (CEST)\\n',\n", + " 'cleaned_content': ' Wir müssen unterscheiden zwischen \\'\\'Volksmund\\'\\' (im Sinne von Helvetismus, der ins Hochdeutsche Eingang gefunden hat) und Schweizerdeutsch. Dass zwei Schweizer in ihren hochdeutschen Texten den Begriff verwenden, macht noch nicht zwingend einen Helvetismus. Und dass jemand \"im Volksmus auch Funis genannt\" schreibt, macht das Wort noch nicht zum schweizerdeutschen Begriff. Ich bin nach wie vor der Ansicht, dass \\'\\'Funi\\'\\' die gebräuchlichste Bezeichnung für \\'\\'Schlittenseilbahnen\\'\\' ist (eine Volksmundbezeichnung lässt sich durch die sehr regioinale Verwendung dieser Geräte gut rechtfertigen), und dass \\'\\'Funi\\'\\' kein Helvetismus ist, weil im allgemeinen Hochdeutsch-Gebrauch von Schweizern (trotz zwei oder drei Ausnahmen) dieses Wort nicht den Standard-Ausdruck Standseilbahn ersetzt hat (und im Gegensatz zu vielen Helvetismen auch keinen Eingang in den Duden gefunden hat). Lars 11:24, 9. Sep. 2009 (CEST)',\n", + " 'replyTo_id': '64263837.4485.4485',\n", + " 'page_id': '2160865',\n", + " 'indentation': 1,\n", + " 'authors': ['2621:Albinfo'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '2621',\n", + " 'type': 'ADDITION',\n", + " 'id': '64317601.7006.7006',\n", + " 'ancestor_id': '64317601.7006.7006',\n", + " 'rev_id': 64317601},\n", + " {'isUnchanged': False,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': None,\n", + " 'user_text': 'Micha',\n", + " 'timestamp': '2009-09-09T09:35:19Z',\n", + " 'content': \":: «Schlittenseilbahn» oder «Schlitten-Standseilbahn» ist Theoriefindung ([[WP:TF]]) einiger weniger Leute, die offenbar der Sinn eines Lemmas nicht verstanden habe. Einfach wieder zurückverschieben auf ''Funi (Seilbahn)'' und gut ist. --[[Benutzer:Micha L. Rieser|Micha]] 11:35, 9. Sep. 2009 (CEST)\\n\",\n", + " 'cleaned_content': \" «Schlittenseilbahn» oder «Schlitten-Standseilbahn» ist Theoriefindung (WP:TF) einiger weniger Leute, die offenbar der Sinn eines Lemmas nicht verstanden habe. Einfach wieder zurückverschieben auf ''Funi (Seilbahn)'' und gut ist. Micha 11:35, 9. Sep. 2009 (CEST)\",\n", + " 'replyTo_id': '64317601.7006.7006',\n", + " 'page_id': '2160865',\n", + " 'indentation': 2,\n", + " 'authors': ['420816:Micha'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '420816',\n", + " 'type': 'ADDITION',\n", + " 'id': '64318003.7942.7942',\n", + " 'ancestor_id': '64318003.7942.7942',\n", + " 'rev_id': 64318003},\n", + " {'isUnchanged': True,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': '64318003.7942.7942',\n", + " 'user_text': 'Micha',\n", + " 'timestamp': '2009-09-09T09:36:06Z',\n", + " 'content': \":: «Schlittenseilbahn» oder «Schlitten-Standseilbahn» ist Theoriefindung (Siehe auch Eiineitung von [[WP:TF]]) einiger weniger Leute, die offenbar der Sinn eines Lemmas nicht verstanden habe. Einfach wieder zurückverschieben auf ''Funi (Seilbahn)'' und gut ist. --[[Benutzer:Micha L. Rieser|Micha]] 11:35, 9. Sep. 2009 (CEST)\\n\",\n", + " 'cleaned_content': \" «Schlittenseilbahn» oder «Schlitten-Standseilbahn» ist Theoriefindung (Siehe auch Eiineitung von WP:TF) einiger weniger Leute, die offenbar der Sinn eines Lemmas nicht verstanden habe. Einfach wieder zurückverschieben auf ''Funi (Seilbahn)'' und gut ist. Micha 11:35, 9. Sep. 
2009 (CEST)\",\n", + " 'replyTo_id': '64317601.7006.7006',\n", + " 'page_id': '2160865',\n", + " 'indentation': 2,\n", + " 'authors': ['420816:Micha'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '420816',\n", + " 'type': 'MODIFICATION',\n", + " 'id': '64318025.7942.7942',\n", + " 'ancestor_id': '64318003.7942.7942',\n", + " 'rev_id': 64318025},\n", + " {'isUnchanged': True,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': '64318025.7942.7942',\n", + " 'user_text': 'Micha',\n", + " 'timestamp': '2009-09-09T09:36:21Z',\n", + " 'content': \":: «Schlittenseilbahn» oder «Schlitten-Standseilbahn» ist Theoriefindung (Siehe auch Einleitung von [[WP:TF]]) einiger weniger Leute, die offenbar der Sinn eines Lemmas nicht verstanden habe. Einfach wieder zurückverschieben auf ''Funi (Seilbahn)'' und gut ist. --[[Benutzer:Micha L. Rieser|Micha]] 11:35, 9. Sep. 2009 (CEST)\\n\",\n", + " 'cleaned_content': \" «Schlittenseilbahn» oder «Schlitten-Standseilbahn» ist Theoriefindung (Siehe auch Einleitung von WP:TF) einiger weniger Leute, die offenbar der Sinn eines Lemmas nicht verstanden habe. Einfach wieder zurückverschieben auf ''Funi (Seilbahn)'' und gut ist. Micha 11:35, 9. Sep. 2009 (CEST)\",\n", + " 'replyTo_id': '64317601.7006.7006',\n", + " 'page_id': '2160865',\n", + " 'indentation': 2,\n", + " 'authors': ['420816:Micha'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '420816',\n", + " 'type': 'MODIFICATION',\n", + " 'id': '64318035.7942.7942',\n", + " 'ancestor_id': '64318003.7942.7942',\n", + " 'rev_id': 64318035},\n", + " {'isUnchanged': False,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': None,\n", + " 'user_text': 'Bobo11',\n", + " 'timestamp': '2009-09-09T19:06:55Z',\n", + " 'content': \":::Also in der Schweiz, und der Artikel behandlet nun mal nur Anlagen in der Schweiz, ist ''Funi'' der verbreiteste Name dafür. Schlittenseilbahn und Schlitten-Standseilbahn sind so gut wie unbekannt. Zurückverschieben auf ''Funi (Seilbahn)'' und gut ist. -- [[Benutzer:Bobo11|Bobo11]] 21:06, 9. Sep. 2009 (CEST)\\n\",\n", + " 'cleaned_content': \"Also in der Schweiz, und der Artikel behandlet nun mal nur Anlagen in der Schweiz, ist ''Funi'' der verbreiteste Name dafür. Schlittenseilbahn und Schlitten-Standseilbahn sind so gut wie unbekannt. Zurückverschieben auf ''Funi (Seilbahn)'' und gut ist. Bobo11 21:06, 9. Sep. 2009 (CEST)\",\n", + " 'replyTo_id': '64318035.7942.7942',\n", + " 'page_id': '2160865',\n", + " 'indentation': 3,\n", + " 'authors': ['112847:Bobo11'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '112847',\n", + " 'type': 'ADDITION',\n", + " 'id': '64337479.8268.8268',\n", + " 'ancestor_id': '64337479.8268.8268',\n", + " 'rev_id': 64337479},\n", + " {'isUnchanged': True,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': '64317601.7006.7006',\n", + " 'user_text': 'Pechristener',\n", + " 'timestamp': '2009-09-10T05:14:28Z',\n", + " 'content': ': Wir müssen unterscheiden zwischen \\'\\'Volksmund\\'\\' (im Sinne von Helvetismus, der ins Hochdeutsche Eingang gefunden hat) und Schweizerdeutsch. Dass zwei Schweizer in ihren hochdeutschen Texten den Begriff verwenden, macht noch nicht zwingend einen Helvetismus. Und dass jemand \"im Volksmund auch Funis genannt\" schreibt, macht das Wort noch nicht zum schweizerdeutschen Begriff. 
Ich bin nach wie vor der Ansicht, dass \\'\\'Funi\\'\\' die gebräuchlichste Bezeichnung für \\'\\'Schlittenseilbahnen\\'\\' ist (eine Volksmundbezeichnung lässt sich durch die sehr regioinale Verwendung dieser Geräte gut rechtfertigen), und dass \\'\\'Funi\\'\\' kein Helvetismus ist, weil im allgemeinen Hochdeutsch-Gebrauch von Schweizern (trotz zwei oder drei Ausnahmen) dieses Wort nicht den Standard-Ausdruck Standseilbahn ersetzt hat (und im Gegensatz zu vielen Helvetismen auch keinen Eingang in den Duden gefunden hat). --[[Benutzer:Albinfo|Lars]] 11:24, 9. Sep. 2009 (CEST)\\n',\n", + " 'cleaned_content': ' Wir müssen unterscheiden zwischen \\'\\'Volksmund\\'\\' (im Sinne von Helvetismus, der ins Hochdeutsche Eingang gefunden hat) und Schweizerdeutsch. Dass zwei Schweizer in ihren hochdeutschen Texten den Begriff verwenden, macht noch nicht zwingend einen Helvetismus. Und dass jemand \"im Volksmund auch Funis genannt\" schreibt, macht das Wort noch nicht zum schweizerdeutschen Begriff. Ich bin nach wie vor der Ansicht, dass \\'\\'Funi\\'\\' die gebräuchlichste Bezeichnung für \\'\\'Schlittenseilbahnen\\'\\' ist (eine Volksmundbezeichnung lässt sich durch die sehr regioinale Verwendung dieser Geräte gut rechtfertigen), und dass \\'\\'Funi\\'\\' kein Helvetismus ist, weil im allgemeinen Hochdeutsch-Gebrauch von Schweizern (trotz zwei oder drei Ausnahmen) dieses Wort nicht den Standard-Ausdruck Standseilbahn ersetzt hat (und im Gegensatz zu vielen Helvetismen auch keinen Eingang in den Duden gefunden hat). Lars 11:24, 9. Sep. 2009 (CEST)',\n", + " 'replyTo_id': '64263837.4485.4485',\n", + " 'page_id': '2160865',\n", + " 'indentation': 1,\n", + " 'authors': ['2621:Albinfo', '5088:Pechristener'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '5088',\n", + " 'type': 'MODIFICATION',\n", + " 'id': '64349198.7006.7006',\n", + " 'ancestor_id': '64317601.7006.7006',\n", + " 'rev_id': 64349198},\n", + " {'isUnchanged': False,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': None,\n", + " 'user_text': 'Pechristener',\n", + " 'timestamp': '2009-09-10T05:14:28Z',\n", + " 'content': ':Gegen \\'\\'Funi\\'\\' als gebräuchlichste Bezeichnung für \\'\\'Schlittenseilbahn\\'\\' habe ich nichts, aber gegen \\'\\'Funi\\'\\' als [[Eindeutigkeit|eindeutig]]er Begriff, so dass er als Lemma ungeeignet ist oder zumindest mit den Weiterleitungen auf die anderen Verwendungen versehen sein müsste.\\n:\\'\\'Funi\\'\\' ist [[Volksmund]] im Sinne von [[Umgangssprache]], einige Leute sind aber durchaus der Meinung, dass \\'\\'Funi\\'\\' Schweizerdeutsch sein könnte, denn sie setzen den Begriff in Anführungszeichen, wenn sie ihn in hochdeutschen Texten verwenden. Die von mir zitierten Quellen, wo Funi für anderes als \\'\\'Schlittenseilbahn\\'\\' verwendet wird, sind als hochdeutsch zu werten, weil der Begriff dort nirgends in Anführungszeichen auftaucht und der restliche Text nicht auf Schweizerdeutsch geschriebene ist.\\n:\\'\\'Funi\\'\\' ist trotzdem ein Helvetismus, auch wenn es nicht der Standardausdruck ist, denn das ist kein Kriterimu für \\'\\'Helvetismus\\'\\'. Es sagen ja auch nicht alle Schweizer \\'\\'Anken\\'\\', was gleichzeitig auch beweist, dass ein Mundartbegriff trotzdem ein Helvetismus sein kann. 
Der Jemand, der \"im Volksmund auch Funis genannt\" schreibt, scheint sich mit dem Thema ziemlich intensiv befasst zu haben und ist deshalb auch entsprechend zu werden, ebenso die Quelle aus der NZZ, welche auf die korrekte Sprache sehr viel Wert legt.\\n: Weshalb \\'\\'Schlittenseilbahn\\'\\' WP:TF sein sollte, ist mir nicht klar. Der Begriff ist eindeutig und erzeugt in Google immerhin 500 eindeutige Suchresultate. Weitere Begründungen, weshalb dieses Lemma gewählt wurde, siehe zu oberst in der Diskussion. \\'\\'Schlitten-Standseilbahn\\'\\' ist auch in meiner Meinung WP:TF.\\n: Es stimmt nicht, dass \\'\\'Funi\\'\\' nicht im Duden ist. \\'\\'Duden - Das große Fremdwörterbuch: Herkunft und Bedeutung der Fremdwörter. 4., aktualisierte Auflage Mannheim, Leipzig, Wien, Zürich: Dudenverlag 2007.\\'\\' weiss dazu: \\'\\'Funi\\'\\': Kurzform von \\'\\'Skifuni\\'\\', ausserdem kennt er auch noch \\'\\'Funiculaire [fynikl:] das; -[s], -s ‹aus gleichbed. fr. funiculaire zu lat. funiculus, vgl. Funikulus›: (veraltet) Drahtseilbahn.\\'\\' Da \\'\\'Funi\\'\\' auch hier offensichtlich eine Kurzform ist, stellt sich die Frage, für was alles denn Funi eine Kurform ist und da kommt eben auch \\'\\'Funiculaire\\'\\' hinzu. -- [[Benutzer:Pechristener|Pechristener]] 07:14, 10. Sep. 2009 (CEST)\\n',\n", + " 'cleaned_content': 'Gegen \\'\\'Funi\\'\\' als gebräuchlichste Bezeichnung für \\'\\'Schlittenseilbahn\\'\\' habe ich nichts, aber gegen \\'\\'Funi\\'\\' als eindeutiger Begriff, so dass er als Lemma ungeeignet ist oder zumindest mit den Weiterleitungen auf die anderen Verwendungen versehen sein müsste.\\n\\'\\'Funi\\'\\' ist Volksmund im Sinne von Umgangssprache, einige Leute sind aber durchaus der Meinung, dass \\'\\'Funi\\'\\' Schweizerdeutsch sein könnte, denn sie setzen den Begriff in Anführungszeichen, wenn sie ihn in hochdeutschen Texten verwenden. Die von mir zitierten Quellen, wo Funi für anderes als \\'\\'Schlittenseilbahn\\'\\' verwendet wird, sind als hochdeutsch zu werten, weil der Begriff dort nirgends in Anführungszeichen auftaucht und der restliche Text nicht auf Schweizerdeutsch geschriebene ist.\\n\\'\\'Funi\\'\\' ist trotzdem ein Helvetismus, auch wenn es nicht der Standardausdruck ist, denn das ist kein Kriterimu für \\'\\'Helvetismus\\'\\'. Es sagen ja auch nicht alle Schweizer \\'\\'Anken\\'\\', was gleichzeitig auch beweist, dass ein Mundartbegriff trotzdem ein Helvetismus sein kann. Der Jemand, der \"im Volksmund auch Funis genannt\" schreibt, scheint sich mit dem Thema ziemlich intensiv befasst zu haben und ist deshalb auch entsprechend zu werden, ebenso die Quelle aus der NZZ, welche auf die korrekte Sprache sehr viel Wert legt.\\n Weshalb \\'\\'Schlittenseilbahn\\'\\' WP:TF sein sollte, ist mir nicht klar. Der Begriff ist eindeutig und erzeugt in Google immerhin 500 eindeutige Suchresultate. Weitere Begründungen, weshalb dieses Lemma gewählt wurde, siehe zu oberst in der Diskussion. \\'\\'Schlitten-Standseilbahn\\'\\' ist auch in meiner Meinung WP:TF.\\n Es stimmt nicht, dass \\'\\'Funi\\'\\' nicht im Duden ist. \\'\\'Duden - Das große Fremdwörterbuch: Herkunft und Bedeutung der Fremdwörter. 4., aktualisierte Auflage Mannheim, Leipzig, Wien, Zürich: Dudenverlag 2007.\\'\\' weiss dazu: \\'\\'Funi\\'\\': Kurzform von \\'\\'Skifuni\\'\\', ausserdem kennt er auch noch \\'\\'Funiculaire [fynikl:] das; -[s], -s ‹aus gleichbed. fr. funiculaire zu lat. funiculus, vgl. 
Funikulus›: (veraltet) Drahtseilbahn.\\'\\' Da \\'\\'Funi\\'\\' auch hier offensichtlich eine Kurzform ist, stellt sich die Frage, für was alles denn Funi eine Kurform ist und da kommt eben auch \\'\\'Funiculaire\\'\\' hinzu. Pechristener 07:14, 10. Sep. 2009 (CEST)',\n", + " 'replyTo_id': '64263837.4485.4485',\n", + " 'page_id': '2160865',\n", + " 'indentation': 1,\n", + " 'authors': ['5088:Pechristener'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '5088',\n", + " 'type': 'ADDITION',\n", + " 'id': '64349198.8582.8581',\n", + " 'ancestor_id': '64349198.8582.8581',\n", + " 'rev_id': 64349198},\n", + " {'isUnchanged': False,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': None,\n", + " 'user_text': 'Micha',\n", + " 'timestamp': '2009-09-10T14:20:25Z',\n", + " 'content': \":: Ein Lemma ist aber ein ''Nachschlagebegriff''. ''Das'', was der Benutzer ''nachschlägt''. Und nicht etwa ein Artikeltitel. Das wird leider immer wieder verwechselt. Wenn also die meisten Leute hier «Funi» nachschlagen, dann ist er auch das korrekte Lemma für den Artikel. Wir sind hier immer noch ein enzyklopädisches ''Nachschlagewerk'' und nicht etwa eine Sammlung von wissenschaftl. Artikel. --[[Benutzer:Micha L. Rieser|Micha]] 16:20, 10. Sep. 2009 (CEST)\\n\",\n", + " 'cleaned_content': \" Ein Lemma ist aber ein ''Nachschlagebegriff''. ''Das'', was der Benutzer ''nachschlägt''. Und nicht etwa ein Artikeltitel. Das wird leider immer wieder verwechselt. Wenn also die meisten Leute hier «Funi» nachschlagen, dann ist er auch das korrekte Lemma für den Artikel. Wir sind hier immer noch ein enzyklopädisches ''Nachschlagewerk'' und nicht etwa eine Sammlung von wissenschaftl. Artikel. Micha 16:20, 10. Sep. 2009 (CEST)\",\n", + " 'replyTo_id': '64349198.8582.8581',\n", + " 'page_id': '2160865',\n", + " 'indentation': 2,\n", + " 'authors': ['420816:Micha'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '420816',\n", + " 'type': 'ADDITION',\n", + " 'id': '64365055.10860.10860',\n", + " 'ancestor_id': '64365055.10860.10860',\n", + " 'rev_id': 64365055},\n", + " {'isUnchanged': False,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': None,\n", + " 'user_text': 'Pechristener',\n", + " 'timestamp': '2009-09-10T20:37:26Z',\n", + " 'content': \"Micha, ich bin mit dir einverstanden oder habe ich dich doch nicht verstanden?\\nDas Lemma [[Funi]] ist vorhanden und kann somit auch nachgeschlagen werden. Da der Begriff nicht eindeutig ist, hat schon jemand vor mir nach [[Wikipedia:Begriffsklärung|WP:BKL]] eine Begriffserklärungsseite gemacht und diesen Artikel als ''Funi (Seilbahn)'' angelegt. Dort stand früher zuoberst drin ''Funi (frz.) ist die Abkürzung von Funiculaire, Standseilbahn.'' und es folgte eine Beschreibung von ''Schlittenseilbahn'' und die Seilbahn [[Funiculaire Neuveville–St.Pierre]] beschrieben. Damit nicht zwei Begriffe im gleichen Artikel vorkommen, wurde aua dem Teil ''Funi-Schlitten - ein Unikum'' der Artikel [[Schlitten-Standseilbahn]]. 
Das fand alles im Dezember 2008 statt, war nicht von mir und hat niemanden gestört bis ich aus dem WP:TF Begriff ''Schlitten-Standseilbahn'' den gebräuchlichen und eindeutigen Begriff ''Schlittenseilbahn'' für den Artikeltitel gewählt habe, worauf dann Meinungen kamen, dass ''Funi (Seilbahn)'' der einzige richtige Artikeltitel wäre, der aber gar nicht eindeutig ist.\\n[http://www.google.ch/search?rlz=1C1CHMB_deCH307CH307&sourceid=chrome&ie=UTF-8&q=funi+seilbahn Funi (Seilbahn)] liefert bei Google schon auf der ersten Suchresultat Link zu [[Funiculaire Neuveville–St.Pierre]], Seiten zu anderen Themen und Seiten wo ''Funi'' nur in Anführungszeichen vorkommt. [http://www.google.ch/search?hl=de&rlz=1C1CHMB_deCH307CH307&q=schlittenseilbahn&btnG=Suche&meta= Schlittenseilbahn] liefert 450 Treffer, die nur das Thema dieses Artikels behandeln.\\nHier noch etwas Statistik zu den Wikipedia Artikeln mit [http://stats.grok.se/ Wikipedia article traffic statistics] [[Schlittenseilbahn]] und [[Schlitten-Standseilbahn]] wurde zusammen 76-mal angeschaut im September 2009, [[Funi]] 27-mal, [[Funi (Seilbahn)]] 15-mal und [[Funi-Schlitten]] 3-mal, wir wissen allerdings nicht, was die Leute sehen wollten. -- [[Benutzer:Pechristener|Pechristener]] 22:37, 10. Sep. 2009 (CEST)\\n\",\n", + " 'cleaned_content': \"Micha, ich bin mit dir einverstanden oder habe ich dich doch nicht verstanden?\\nDas Lemma Funi ist vorhanden und kann somit auch nachgeschlagen werden. Da der Begriff nicht eindeutig ist, hat schon jemand vor mir nach WP:BKL eine Begriffserklärungsseite gemacht und diesen Artikel als ''Funi (Seilbahn)'' angelegt. Dort stand früher zuoberst drin ''Funi (frz.) ist die Abkürzung von Funiculaire, Standseilbahn.'' und es folgte eine Beschreibung von ''Schlittenseilbahn'' und die Seilbahn Funiculaire Neuveville–St.Pierre beschrieben. Damit nicht zwei Begriffe im gleichen Artikel vorkommen, wurde aua dem Teil ''Funi-Schlitten - ein Unikum'' der Artikel Schlitten-Standseilbahn. Das fand alles im Dezember 2008 statt, war nicht von mir und hat niemanden gestört bis ich aus dem WP:TF Begriff ''Schlitten-Standseilbahn'' den gebräuchlichen und eindeutigen Begriff ''Schlittenseilbahn'' für den Artikeltitel gewählt habe, worauf dann Meinungen kamen, dass ''Funi (Seilbahn)'' der einzige richtige Artikeltitel wäre, der aber gar nicht eindeutig ist.\\nFuni (Seilbahn) liefert bei Google schon auf der ersten Suchresultat Link zu Funiculaire Neuveville–St.Pierre, Seiten zu anderen Themen und Seiten wo ''Funi'' nur in Anführungszeichen vorkommt. Schlittenseilbahn liefert 450 Treffer, die nur das Thema dieses Artikels behandeln.\\nHier noch etwas Statistik zu den Wikipedia Artikeln mit Wikipedia article traffic statistics Schlittenseilbahn und Schlitten-Standseilbahn wurde zusammen 76-mal angeschaut im September 2009, Funi 27-mal, Funi (Seilbahn) 15-mal und Funi-Schlitten 3-mal, wir wissen allerdings nicht, was die Leute sehen wollten. Pechristener 22:37, 10. Sep. 
2009 (CEST)\",\n", + " 'replyTo_id': '64211111.132.132',\n", + " 'page_id': '2160865',\n", + " 'indentation': 0,\n", + " 'authors': ['5088:Pechristener'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '5088',\n", + " 'type': 'ADDITION',\n", + " 'id': '64377492.11323.11323',\n", + " 'ancestor_id': '64377492.11323.11323',\n", + " 'rev_id': 64377492},\n", + " {'isUnchanged': True,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': '64377492.11323.11323',\n", + " 'user_text': 'Pechristener',\n", + " 'timestamp': '2009-09-11T20:57:07Z',\n", + " 'content': \"Micha, ich bin mit dir einverstanden oder habe ich dich doch nicht verstanden?\\nDas Lemma [[Funi]] ist vorhanden und kann somit auch nachgeschlagen werden. Da der Begriff nicht eindeutig ist, hat schon jemand vor mir nach [[Wikipedia:Begriffsklärung|WP:BKL]] eine Begriffserklärungsseite gemacht und diesen Artikel als ''Funi (Seilbahn)'' angelegt. Dort stand [http://de.wikipedia.org/w/index.php?title=Schlittenseilbahn&oldid=53727936 früher] zuoberst drin ''Funi (frz.) ist die Abkürzung von Funiculaire, Standseilbahn.'' und es folgte eine Beschreibung von ''Schlittenseilbahn'' und der Seilbahn [[Funiculaire Neuveville–St.Pierre]]. Damit nicht zwei Begriffe im gleichen Artikel vorkommen, wurde aus dem Teil ''Funi-Schlitten - ein Unikum'' der Artikel [[Schlitten-Standseilbahn]] und aus dem anderen [[Wasserballastbahn]]. Das fand alles im Dezember 2008 statt, war nicht von mir und hat niemanden gestört bis ich aus dem WP:TF Begriff ''Schlitten-Standseilbahn'' den gebräuchlichen und eindeutigen Begriff ''Schlittenseilbahn'' für den Artikeltitel gewählt habe, worauf dann Meinungen kamen, dass ''Funi (Seilbahn)'' der einzige richtige Artikeltitel wäre, der aber gar nicht eindeutig ist.\\n[http://www.google.ch/search?rlz=1C1CHMB_deCH307CH307&sourceid=chrome&ie=UTF-8&q=funi+seilbahn Funi (Seilbahn)] liefert bei Google schon auf der ersten Suchresultat Link zu [[Funiculaire Neuveville–St.Pierre]], Seiten zu anderen Themen und Seiten wo ''Funi'' nur in Anführungszeichen vorkommt. [http://www.google.ch/search?hl=de&rlz=1C1CHMB_deCH307CH307&q=schlittenseilbahn&btnG=Suche&meta= Schlittenseilbahn] liefert 450 Treffer, die nur das Thema dieses Artikels behandeln.\\nHier noch etwas Statistik zu den Wikipedia Artikeln mit [http://stats.grok.se/ Wikipedia article traffic statistics] [[Schlittenseilbahn]] und [[Schlitten-Standseilbahn]] wurde zusammen 76-mal angeschaut im September 2009, [[Funi]] 27-mal, [[Funi (Seilbahn)]] 15-mal und [[Funi-Schlitten]] 3-mal, wir wissen allerdings nicht, was die Leute sehen wollten. -- [[Benutzer:Pechristener|Pechristener]] 22:37, 10. Sep. 2009 (CEST)\\n\",\n", + " 'cleaned_content': \"Micha, ich bin mit dir einverstanden oder habe ich dich doch nicht verstanden?\\nDas Lemma Funi ist vorhanden und kann somit auch nachgeschlagen werden. Da der Begriff nicht eindeutig ist, hat schon jemand vor mir nach WP:BKL eine Begriffserklärungsseite gemacht und diesen Artikel als ''Funi (Seilbahn)'' angelegt. Dort stand früher zuoberst drin ''Funi (frz.) ist die Abkürzung von Funiculaire, Standseilbahn.'' und es folgte eine Beschreibung von ''Schlittenseilbahn'' und der Seilbahn Funiculaire Neuveville–St.Pierre. Damit nicht zwei Begriffe im gleichen Artikel vorkommen, wurde aus dem Teil ''Funi-Schlitten - ein Unikum'' der Artikel Schlitten-Standseilbahn und aus dem anderen Wasserballastbahn. 
Das fand alles im Dezember 2008 statt, war nicht von mir und hat niemanden gestört bis ich aus dem WP:TF Begriff ''Schlitten-Standseilbahn'' den gebräuchlichen und eindeutigen Begriff ''Schlittenseilbahn'' für den Artikeltitel gewählt habe, worauf dann Meinungen kamen, dass ''Funi (Seilbahn)'' der einzige richtige Artikeltitel wäre, der aber gar nicht eindeutig ist.\\nFuni (Seilbahn) liefert bei Google schon auf der ersten Suchresultat Link zu Funiculaire Neuveville–St.Pierre, Seiten zu anderen Themen und Seiten wo ''Funi'' nur in Anführungszeichen vorkommt. Schlittenseilbahn liefert 450 Treffer, die nur das Thema dieses Artikels behandeln.\\nHier noch etwas Statistik zu den Wikipedia Artikeln mit Wikipedia article traffic statistics Schlittenseilbahn und Schlitten-Standseilbahn wurde zusammen 76-mal angeschaut im September 2009, Funi 27-mal, Funi (Seilbahn) 15-mal und Funi-Schlitten 3-mal, wir wissen allerdings nicht, was die Leute sehen wollten. Pechristener 22:37, 10. Sep. 2009 (CEST)\",\n", + " 'replyTo_id': '64211111.132.132',\n", + " 'page_id': '2160865',\n", + " 'indentation': 0,\n", + " 'authors': ['5088:Pechristener'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '5088',\n", + " 'type': 'MODIFICATION',\n", + " 'id': '64413313.11323.11323',\n", + " 'ancestor_id': '64377492.11323.11323',\n", + " 'rev_id': 64413313},\n", + " {'isUnchanged': False,\n", + " 'page_title': 'Diskussion:Schlittenseilbahn',\n", + " 'parent_id': None,\n", + " 'user_text': 'Blauer Heinrich',\n", + " 'timestamp': '2009-09-26T21:32:27Z',\n", + " 'content': 'Da ist nun ein Heidenchaos entstanden. Ich lese mir nicht alles durch. ZUerst ein Hinweis, warum die Abkürzung Funi so beliebt war, ein berühmter Song ging ab 1880 um die Welt: [[http://it.wikipedia.org/wiki/Funicul%C3%AC_funicul%C3%A0 Italienische Wiki]] oder hier: [[http://en.wikipedia.org/wiki/Funicul%C3%AC,_Funicul%C3%A0 Englische Version]]. Es war somit ein Modewort und darum identifizierte ein Unternehmen sich gerne damit, das war Geschäftsfördernd. Da auf den Vesuv 1880 streckenmässig zuerst eine Tramlinie und dann erst im oberen Abschnitt eine Standseilbahn fuhr [http://www.vesuvioinrete.it/funicolare/e_ferrovia_storia.htm Hist. Vesuvbahn] , vermischte sich auch bald der Ausdruck (es gibt noch mehr Beispiele, so blieb auch in Lausanne lange der Name, auch als die Bahn längst ein Zahnrad hatte). Funi ist ein populärer Volksausdruck für eine Bahn die in die Höhe strebt und die wenigsten kümmerte es noch heute ob es eine Standseilbahn, eine Zahnradbahn oder eine Strassenkabelbahn ist. Und darum ist Funi ein offener Begriff im Volksmund, abgeleitet von Funiculaire und umschreibt eine ursprünglich am Seil operierende Bahn. Die Funi-Schlitten gehören genau so dazu wie die Standseilbahnen. Dazu gibt es in Biel ein Unternehmen, den Funicar. Heute ein Transportunternehmen entstanden aus der Seilbahn Biel – Leubringen. Funi ist in der westlichen Schweiz ein volkstümlicher Begriff. Gruss, habe nun wohl nochmehr Chaos produziert. --[[Benutzer:Blauer Heinrich|Blauer Heinrich]] 23:32, 26. Sep. 2009 (CEST)\\n',\n", + " 'cleaned_content': 'Da ist nun ein Heidenchaos entstanden. Ich lese mir nicht alles durch. ZUerst ein Hinweis, warum die Abkürzung Funi so beliebt war, ein berühmter Song ging ab 1880 um die Welt: [Italienische Wiki] oder hier: [Englische Version]. Es war somit ein Modewort und darum identifizierte ein Unternehmen sich gerne damit, das war Geschäftsfördernd. 
Da auf den Vesuv 1880 streckenmässig zuerst eine Tramlinie und dann erst im oberen Abschnitt eine Standseilbahn fuhr Hist. Vesuvbahn , vermischte sich auch bald der Ausdruck (es gibt noch mehr Beispiele, so blieb auch in Lausanne lange der Name, auch als die Bahn längst ein Zahnrad hatte). Funi ist ein populärer Volksausdruck für eine Bahn die in die Höhe strebt und die wenigsten kümmerte es noch heute ob es eine Standseilbahn, eine Zahnradbahn oder eine Strassenkabelbahn ist. Und darum ist Funi ein offener Begriff im Volksmund, abgeleitet von Funiculaire und umschreibt eine ursprünglich am Seil operierende Bahn. Die Funi-Schlitten gehören genau so dazu wie die Standseilbahnen. Dazu gibt es in Biel ein Unternehmen, den Funicar. Heute ein Transportunternehmen entstanden aus der Seilbahn Biel – Leubringen. Funi ist in der westlichen Schweiz ein volkstümlicher Begriff. Gruss, habe nun wohl nochmehr Chaos produziert. Blauer Heinrich 23:32, 26. Sep. 2009 (CEST)',\n", + " 'replyTo_id': '64211111.132.132',\n", + " 'page_id': '2160865',\n", + " 'indentation': 0,\n", + " 'authors': ['211870:Blauer Heinrich'],\n", + " 'conversation_id': '64211111.132.132',\n", + " 'user_id': '211870',\n", + " 'type': 'ADDITION',\n", + " 'id': '64951299.13422.13422',\n", + " 'ancestor_id': '64951299.13422.13422',\n", + " 'rev_id': 64951299}]}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "convo = dataset[0]\n", + "convo" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "dd13eb5e-d395-4dc5-9fa8-be6012cb9a8f", + "metadata": {}, + "outputs": [], + "source": [ + "def format_wiki_german_convo(convo, truncated_by = 0, start_at = 0): \n", + " utt_list = convo['utterances']\n", + " transcription = []\n", + " spk_list = {}\n", + " utt_list = utt_list[:len(utt_list) - truncated_by]\n", + " utt_list = utt_list[start_at:]\n", + " for utt in utt_list:\n", + " sp = utt['authors'][0]\n", + " if sp not in spk_list.keys():\n", + " spk_list[sp] = len(spk_list) + 1\n", + " transcription.append(\"SPEAKER\"+str(spk_list[sp]) +\": \"+utt['cleaned_content'])\n", + " return transcription" + ] + }, + { + "cell_type": "markdown", + "id": "f913f392-c79f-4fcf-ad37-14e69920c5d7", + "metadata": {}, + "source": [ + "### Generating SoP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b82098fd", + "metadata": {}, + "outputs": [], + "source": [ + "wiki_german_SCD_prompt = \"\"\"\n", + "Write a short summary capturing the trajectory of a Wikipedia talk-page discussion. Do not include specific article content, titles, policy names, diffs/edits, quotes, or concrete claims. The style you should avoid is illustrated in Example Sentence 1: “Speaker1 insisted an article include a particular detail and cited a specific policy by name. Speaker2 countered with a different policy and argued that the section should be removed. Speaker3 referenced a prior version and proposed a precise rewrite.” Instead, you should include indicators of sentiments (e.g., sarcasm, politeness, frustration), intentions (e.g., agreement, disagreement, rebuttal, concession, clarification, accusation), and strategies (e.g., consensus attempts, moderation, revert-restore cycles, rhetorical questions, appeals to emotion). The following sentences demonstrate the style you should follow: Example Sentence 2: “Both speakers hold differing views and become defensive. Speaker1 diminishes the weight of Speaker2’s reasoning, and Speaker2 blames Speaker1 for an uncivil tone. 
Both accuse each other of focusing on personal traits rather than reasoning.” Example Sentence 3: “The speakers refute each other with back-and-forth accusations. Persistent fault-finding and critical stances escalate tension and hinder productive discussion.” Overall, the trajectory summary should capture the key moments where the discussion’s tone or coordination changes. Here is an example of a complete trajectory summary: Multiple speakers discuss possible changes. Several present differing stances in sequence, building on and contesting each other’s reasoning. Speaker1 disputes a point from Speaker2, prompting a rebuttal. Speaker3 supports Speaker1, after which Speaker2 defends their position. Later, a speaker references a removed remark and offers an extended counter. Despite friction, the tone remains mostly civil with attempts at consensus. Now, provide the trajectory summary for the following conversation. Conversation Transcript: {formatted_object}. Now, summarize this conversation. Remember, do not include specific topics, claims, policies, or edits. Instead, capture the speakers’ sentiments, intentions, and strategies. Limit the trajectory summary to 80 words. Trajectory Summary (in English):\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ceb8107", + "metadata": {}, + "outputs": [], + "source": [ + "wiki_german_sop_prompt = \"\"\"\n", + "Here is a trajectory summary of a conversation that lays out how the dynamics of the conversation developed. You need to parse the summary into events in order. \n", + "Follow the following guidelines:\n", + "1. Try to maintain the original language of the summary as much as you can. \n", + "2. Provide your output as a Python dictionary with the following structure:\n", + "_(Note: Do NOT use markdown, JSON formatting, or code block delimiters.)_ \n", + "{{\n", + " '0': \"\" // description of the event\n", + " '1': ...\n", + " ...\n", + "}}\n", + "Here is the summary:\n", + "{formatted_object}\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aecf6975", + "metadata": {}, + "outputs": [], + "source": [ + "scd_transformer = SCD(model_provider=MODEL_PROVIDER, \n", + " model=MODEL, \n", + " config=config, \n", + " custom_scd_prompt=wiki_german_SCD_prompt, \n", + " custom_sop_prompt=wiki_german_sop_prompt,\n", + " custom_prompt_dir=\"wiki_german\")\n", + "condyns = ConDynS(model_provider=MODEL_PROVIDER, \n", + " model=MODEL, \n", + " config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "243ab31c-9503-4dca-9489-56dc6f80ff38", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Generating SCDs for conversations: 100%|██████| 100/100 [01:40<00:00, 1.00s/it]\n" + ] + } + ], + "source": [ + "# For non-ConvoKit data, we need to create a corpus to use the SCD transformer directly\n", + "\n", + "from convokit import Corpus, Conversation, Utterance, Speaker\n", + "\n", + "# Create ConvoKit objects from the raw data\n", + "utterances = []\n", + "speakers = {}\n", + "conversations = {}\n", + "\n", + "for convo in dataset:\n", + " convo_id = convo['convo_id']\n", + " \n", + " for i, utt_data in enumerate(convo['utterances']):\n", + " speaker_id = utt_data['authors'][0]\n", + " if speaker_id not in speakers:\n", + " speakers[speaker_id] = Speaker(id=speaker_id)\n", + " \n", + " convo_utterances = []\n", + " for i, utt_data in enumerate(convo['utterances']):\n", + " speaker_id = utt_data['authors'][0]\n", + 
" utt_id = f\"{convo_id}_{i}\"\n", + " \n", + " utterance = Utterance(\n", + " id=utt_id,\n", + " speaker=speakers[speaker_id],\n", + " conversation_id=convo_id,\n", + " text=utt_data['cleaned_content']\n", + " )\n", + " utterances.append(utterance)\n", + " convo_utterances.append(utterance)\n", + " \n", + " conversations[convo_id] = Conversation(\n", + " id=convo_id,\n", + " utterances=convo_utterances\n", + " )\n", + "\n", + "temp_corpus = Corpus(utterances=utterances)\n", + "\n", + "def format_wiki_german_conversation(conversation):\n", + " return \"\\n\\n\".join(format_wiki_german_convo({'utterances': [\n", + " {'authors': [utt.speaker.id], 'cleaned_content': utt.text} \n", + " for utt in conversation.get_chronological_utterance_list()\n", + " ]}))\n", + "\n", + "scd_transformer.conversation_formatter = format_wiki_german_conversation\n", + "\n", + "scd_transformer.transform(temp_corpus)\n", + "\n", + "time_analysis_scd = {}\n", + "bulletpoints = {}\n", + "for convo_id in [convo['convo_id'] for convo in dataset]:\n", + " convo = temp_corpus.get_conversation(convo_id)\n", + " time_analysis_scd[convo_id] = convo.meta.get(\"machine_scd\", \"\")\n", + " bulletpoints[convo_id] = convo.meta.get(\"machine_sop\", \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66eb37fa-d14b-412a-97a8-056063f2e312", + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_PATH + f\"wiki_german_100_scd.json\", 'w') as file:\n", + " json.dump(time_analysis_scd, file, indent=4)\n", + "\n", + "with open(DATA_PATH + f\"wiki_german_100_sop.json\", 'w') as file:\n", + " json.dump(bulletpoints, file, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "f3721855-27d0-490b-9017-b54ccb1e5421", + "metadata": {}, + "outputs": [], + "source": [ + "similarity_and_bulletpoints = {\"scd\" : time_analysis_scd, \"bulletpoints\" : bulletpoints}" + ] + }, + { + "cell_type": "markdown", + "id": "5a7ad80c-11d6-4f7e-b457-5bd907786558", + "metadata": {}, + "source": [ + "### Compute ConDynS Scores" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d67ee225-edf5-478e-bd6f-5362290f41cd", + "metadata": {}, + "outputs": [], + "source": [ + "convo_id_to_convo = {}\n", + "for convo in dataset:\n", + " convo_id_to_convo[convo['convo_id']] = convo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f591d844-567d-491d-82f6-b43a5bb46ca6", + "metadata": {}, + "outputs": [], + "source": [ + "num = 100\n", + "convo_ids = [convo['convo_id'] for convo in dataset]\n", + "assert len(convo_ids) == num\n", + "\n", + "all_combos = list(combinations(convo_ids, 2))\n", + "convo_scores = {}\n", + "\n", + "# Define custom formatter for ConDynS\n", + "def wiki_german_formatter(conversation):\n", + " # Convert conversation back to the format expected by format_wiki_german_convo\n", + " convo_data = {'utterances': [\n", + " {'authors': [utt.speaker.id], 'cleaned_content': utt.text} \n", + " for utt in conversation.get_chronological_utterance_list()\n", + " ]}\n", + " return \"\\n\\n\".join(format_wiki_german_convo(convo_data))\n", + "\n", + "for convo_id1, convo_id2 in tqdm(all_combos, desc=\"Calculating pairs similarity\"):\n", + " if convo_id1 + \"_\" + convo_id2 in convo_scores or convo_id2 + \"_\" + convo_id1 in convo_scores:\n", + " continue\n", + " \n", + " # Use the new compare_conversations method\n", + " result, score = condyns.compare_conversations(\n", + " temp_corpus, convo_id1, convo_id2, \n", + " sop_meta_name=\"machine_sop\",\n", + " 
formatter=wiki_german_formatter\n", +    "    )\n", +    "    \n", +    "    convo_scores[convo_id1 + \"_\" + convo_id2] = {\"result\": result, \"score\": score}"   ]  },  {   "cell_type": "code",   "execution_count": null,   "id": "52bda795-8f7f-49c4-8ef0-24731e097d38",   "metadata": {},   "outputs": [],   "source": [    "with open(DATA_PATH + f\"wiki_german_100_scores.json\", 'w') as file:\n", +    "    json.dump(convo_scores, file, indent=4)"   ]  },  {   "cell_type": "code",   "execution_count": null,   "id": "7fd3f898-3194-40ba-b484-7a7092db316c",   "metadata": {},   "outputs": [    {     "data": {      "text/plain": [       "{'0': {'analysis': 'Speaker1 proposes mentioning a Verbrauch value, providing context and justifications based on test results.',\n", +       " 'score': 0.8},\n", +       " '1': {'analysis': 'No disagreement or defense of an original term is present in the provided transcript.',\n", +       " 'score': 0.0},\n", +       " '2': {'analysis': 'No compromise or further advocacy from Speaker1 is evident in the provided transcript.',\n", +       " 'score': 0.0},\n", +       " '3': {'analysis': 'No alignment of Speaker3 and Speaker2 against Speaker1 is present.',\n", +       " 'score': 0.0},\n", +       " '4': {'analysis': 'Speaker1 does not persist in their viewpoint or refute others claims.',\n", +       " 'score': 0.0},\n", +       " '5': {'analysis': 'No direct attack on Speaker1 reasoning or dismissal of understanding occurs.',\n", +       " 'score': 0.0},\n", +       " '6': {'analysis': 'No support for an attacker position is present in the transcript.',\n", +       " 'score': 0.0},\n", +       " '7': {'analysis': 'No historical explanation or acknowledgment of complexity is offered.',\n", +       " 'score': 0.0}}"      ]     },     "execution_count": 33,     "metadata": {},     "output_type": "execute_result"    }   ],   "source": [    "convo_scores['64211111.132.132_152670880.5344.5344'][\"result\"][0]"   ]  },  {   "cell_type": "markdown",   "id": "c127890c-99df-4a90-93b2-c986bc8629d7",   "metadata": {},   "source": [    "# Clustering"   ]  },  {   "cell_type": "code",   "execution_count": null,   "id": "665d7b10-f2a5-4ace-b262-07dfa2b07995",   "metadata": {},   "outputs": [],   "source": [    "def get_similarity(convo_id1, convo_id2):\n", +    "    # Pairwise scores are stored under a single key orientation, so check both orderings\n", +    "    if convo_id1 + \"_\" + convo_id2 in convo_scores:\n", +    "        return convo_scores[convo_id1 + \"_\" + convo_id2][\"score\"]\n", +    "    elif convo_id2 + \"_\" + convo_id1 in convo_scores:\n", +    "        return convo_scores[convo_id2 + \"_\" + convo_id1][\"score\"]\n", +    "    else:\n", +    "        print(\"Did not find the score\")\n", +    "        return"   ]  },  {   "cell_type": "code",   "execution_count": 14,   "id": "973bb148-c5a1-44d9-ad26-624be3f8c2d9",   "metadata": {},   "outputs": [],   "source": [    "import numpy as np\n", +    "import matplotlib.pyplot as plt\n", +    "from scipy.cluster.hierarchy import dendrogram, linkage\n", +    "from scipy.spatial.distance import squareform\n", +    "\n", +    "# Step 1: Create the distance matrix\n", +    "n = len(convo_ids)\n", +    "distance_matrix = np.zeros((n, n))\n", +    "\n", +    "# Fill the distance matrix\n", +    "for i in range(n):\n", +    "    for j in range(i + 1, n):\n", +    "        convo1, convo2 = convo_ids[i], convo_ids[j]\n", +    "        similarity = np.sum(get_similarity(convo1, convo2))\n", +    "        distance = 2 - similarity # Convert similarity to distance\n", +    "        distance_matrix[i, j] = distance_matrix[j, i] = distance # Symmetric matrix\n", +    "\n", +    "# Convert to condensed format for linkage function\n", +    "condensed_dist_matrix = squareform(distance_matrix)\n", +    "\n", +    "# Step 2: Perform hierarchical clustering\n", +    "linkage_matrix = 
linkage(condensed_dist_matrix, method=\"ward\") # Ward's method minimizes variance\n", + "\n", + "\n", + "from scipy.cluster.hierarchy import fcluster\n", + "from collections import defaultdict\n", + "\n", + "top_level_clusters = fcluster(linkage_matrix, t=2, criterion='maxclust')\n", + "\n", + "clusters = defaultdict(list)\n", + "for idx, label in enumerate(top_level_clusters):\n", + " clusters[label].append(idx)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "293e7c54", + "metadata": {}, + "outputs": [], + "source": [ + "# clusters[1] and clusters[2] are your top-level clusters\n", + "cluster1 = [time_analysis_scd[convo_ids[i]] for i in clusters[1]]\n", + "cluster2 = [time_analysis_scd[convo_ids[i]] for i in clusters[2]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b88a16df", + "metadata": {}, + "outputs": [], + "source": [ + "# from https://github.com/jmhessel/FightingWords/blob/master/fighting_words_py3.py\n", + "exclude = set(string.punctuation)\n", + "\n", + "def basic_sanitize(in_string):\n", + " '''Returns a very roughly sanitized version of the input string.'''\n", + " in_string = ''.join([ch for ch in in_string if ch not in exclude])\n", + " in_string = in_string.lower()\n", + " in_string = ' '.join(in_string.split())\n", + " return in_string\n", + "\n", + "def bayes_compare_language(l1, l2, ngram = 1, prior=.01, cv = None):\n", + " '''\n", + " Arguments:\n", + " - l1, l2; a list of strings from each language sample\n", + " - ngram; an int describing up to what n gram you want to consider (1 is unigrams,\n", + " 2 is bigrams + unigrams, etc). Ignored if a custom CountVectorizer is passed.\n", + " - prior; either a float describing a uniform prior, or a vector describing a prior\n", + " over vocabulary items. 
If you're using a predefined vocabulary, make sure to specify that\n", + " when you make your CountVectorizer object.\n", + " - cv; a sklearn.feature_extraction.text.CountVectorizer object, if desired.\n", + "\n", + " Returns:\n", + " - A list of length |Vocab| where each entry is a (n-gram, zscore) tuple.'''\n", + " if cv is None and type(prior) is not float:\n", + " print(\"If using a non-uniform prior:\")\n", + " print(\"Please also pass a count vectorizer with the vocabulary parameter set.\")\n", + " quit()\n", + " l1 = [basic_sanitize(l) for l in l1]\n", + " l2 = [basic_sanitize(l) for l in l2]\n", + " if cv is None:\n", + " cv = CV(decode_error = 'ignore', min_df=2, max_df=0.9, ngram_range=(1,ngram),\n", + " binary = False,\n", + " max_features = 15000)\n", + " counts_mat = cv.fit_transform(l1+l2).toarray()\n", + " # Now sum over languages...\n", + " vocab_size = len(cv.vocabulary_)\n", + " print(\"Vocab size is {}\".format(vocab_size))\n", + " if type(prior) is float:\n", + " priors = np.array([prior for i in range(vocab_size)])\n", + " else:\n", + " priors = prior\n", + " z_scores = np.empty(priors.shape[0])\n", + " count_matrix = np.empty([2, vocab_size], dtype=np.float32)\n", + " count_matrix[0, :] = np.sum(counts_mat[:len(l1), :], axis = 0)\n", + " count_matrix[1, :] = np.sum(counts_mat[len(l1):, :], axis = 0)\n", + " a0 = np.sum(priors)\n", + " n1 = 1.*np.sum(count_matrix[0,:])\n", + " n2 = 1.*np.sum(count_matrix[1,:])\n", + " print(\"Comparing language...\")\n", + " for i in range(vocab_size):\n", + " #compute delta\n", + " term1 = np.log((count_matrix[0,i] + priors[i])/(n1 + a0 - count_matrix[0,i] - priors[i]))\n", + " term2 = np.log((count_matrix[1,i] + priors[i])/(n2 + a0 - count_matrix[1,i] - priors[i]))\n", + " delta = term1 - term2\n", + " #compute variance on delta\n", + " var = 1./(count_matrix[0,i] + priors[i]) + 1./(count_matrix[1,i] + priors[i])\n", + " #store final score\n", + " z_scores[i] = delta/np.sqrt(var)\n", + " index_to_term = {v:k for k,v in cv.vocabulary_.items()}\n", + " sorted_indices = np.argsort(z_scores)\n", + " return_list = []\n", + " for i in sorted_indices:\n", + " return_list.append((index_to_term[i], z_scores[i]))\n", + " return return_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b99a0763-e1ab-4bc7-82b1-a4a29445fbb4", + "metadata": {}, + "outputs": [], + "source": [ + "def get_fighting_words_matching_bullets(cluster1, cluster2, similarity_result=convo_scores, summaries_and_bullets=similarity_and_bulletpoints):\n", + " cluster1_combo = list(combinations(cluster1, 2))\n", + " matched_cluster1 = []\n", + " for convo_id1, convo_id2 in cluster1_combo:\n", + " key = f\"{convo_id1}_{convo_id2}\" if f\"{convo_id1}_{convo_id2}\" in similarity_result.keys() else f\"{convo_id2}_{convo_id1}\"\n", + " for k, result in enumerate(similarity_result[key][\"result\"]):\n", + " for index in result.keys():\n", + " if result[index]['score'] > 0.5:\n", + " if k == 0:\n", + " matched_cluster1.append(summaries_and_bullets['bulletpoints'][convo_id1][index])\n", + " else:\n", + " try:\n", + " matched_cluster1.append(summaries_and_bullets['bulletpoints'][convo_id2][index])\n", + " except Exception:\n", + " continue\n", + " \n", + " cluster2_combo = list(combinations(cluster2, 2))\n", + " matched_cluster2 = []\n", + " for convo_id1, convo_id2 in cluster2_combo:\n", + " key = f\"{convo_id1}_{convo_id2}\" if f\"{convo_id1}_{convo_id2}\" in similarity_result.keys() else f\"{convo_id2}_{convo_id1}\"\n", + " for k, result in 
enumerate(similarity_result[key][\"result\"]):\n", +    "            for index in result.keys():\n", +    "                if result[index]['score'] > 0.5:\n", +    "                    if k == 0:\n", +    "                        matched_cluster2.append(summaries_and_bullets['bulletpoints'][convo_id1][index])\n", +    "                    else:\n", +    "                        matched_cluster2.append(summaries_and_bullets['bulletpoints'][convo_id2][index])\n", +    "    \n", +    "    z_scores = bayes_compare_language(matched_cluster1, matched_cluster2, ngram = 3) \n", +    "    top_k = 15\n", +    "    top_k_class1 = list(reversed([(x[0], round(x[1],2)) for x in z_scores[-top_k:]]))\n", +    "    top_k_class2 = [(x[0], round(x[1],2)) for x in z_scores[:top_k]]\n", +    "    print(f\"Fighting Words Comments between:\")\n", +    "    print(\"Cluster1: \", top_k_class1)\n", +    "    print(\"Cluster2: \", top_k_class2)\n", +    "    return matched_cluster1, matched_cluster2"   ]  },  {   "cell_type": "code",   "execution_count": 46,   "id": "aeaae99a-2d51-49ef-b862-28b5cc1adbf2",   "metadata": {},   "outputs": [    {     "data": {      "text/plain": [       "(45, 55)"      ]     },     "execution_count": 46,     "metadata": {},     "output_type": "execute_result"    }   ],   "source": [    "len(clusters[1]), len(clusters[2])"   ]  },  {   "cell_type": "code",   "execution_count": 47,   "id": "ce5b5eb7-a000-4615-bdd1-eb9bf725929c",   "metadata": {},   "outputs": [],   "source": [    "cluster1_ids = [convo_ids[i] for i in clusters[1]]\n", +    "cluster2_ids = [convo_ids[i] for i in clusters[2]]"   ]  },  {   "cell_type": "code",   "execution_count": 51,   "id": "3bf255f4-f543-476e-9dfe-790c5aea18d3",   "metadata": {},   "outputs": [    {     "name": "stdout",     "output_type": "stream",     "text": [      "Vocab size is 6959\n", +      "Comparing language...\n", +      "Fighting Words Comments between:\n", +      "Cluster1:  [('message', 19.06), ('gratitude', 17.8), ('issue', 17.58), ('repeats', 16.04), ('responds with', 15.72), ('is', 15.18), ('helpful', 14.28), ('polite', 14.06), ('speaker2 responds with', 13.81), ('same', 12.86), ('tone is', 12.55), ('expresses gratitude', 12.31), ('the same', 11.93), ('with gratitude', 11.77), ('brief', 11.6)]\n", +      "Cluster2:  [('speaker3', -16.28), ('disagreement', -12.94), ('expressing', -11.49), ('confusion', -10.95), ('with question', -9.81), ('conversation with question', -9.76), ('enters', -9.18), ('additional', -8.4), ('speaker4', -8.18), ('point', -8.06), ('with question expressing', -7.93), ('question expressing', -7.93), ('speaker3 enters', -7.65), ('with speaker2', -7.64), ('defends', -7.6)]\n"     ]    }   ],   "source": [    "cluster1_bulletpoints, cluster2_bulletpoints = get_fighting_words_matching_bullets(cluster1_ids, cluster2_ids)"   ]  },  {   "cell_type": "code",   "execution_count": 52,   "id": "a7bad7c9-c9c4-4c45-b3f9-955690222b04",   "metadata": {},   "outputs": [    {     "data": {      "text/plain": [       "['Speaker1 repeats the same information and question multiple times',\n", +       " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", +       " 'Speaker1 repeats the same information and question multiple times',\n", +       " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", +       " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", +       " 'Speaker1 is possibly confused or insistent on getting an answer',\n", +       " 'The overall tone is neutral but persistent',\n", +       " 'Speaker1 initiates the conversation with a 
question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'Speaker1 is possibly confused or insistent on getting an answer',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 reports an issue, seemingly through an automated message.',\n", + " 'Speaker1 intends to report issues.',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 initiates the conversation by posting a lengthy comment',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'The overall tone is neutral but persistent',\n", + " 'The overall tone is informative and helpful.',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'Speaker1 is possibly confused or insistent on getting an answer',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 reports an error, pointing out a specific issue with a previous edit',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'Speaker1 is possibly confused or insistent on getting an answer',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 expresses concern, possibly about vandalism',\n", + " 'Speaker1 reiterates their initial concern',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'Speaker1 is possibly confused or insistent on getting an answer',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 initiates the conversation with a neutral notification directed at 
Speaker2, pointing out an oversight',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 initiates the conversation with a suggestion',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'Speaker1 initiates the conversation with a polite inquiry, expressing confusion and seeking clarification',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 initiates a message',\n", + " 'Speaker1 sends a series of replies',\n", + " 'Speaker1 seems to be correcting an error',\n", + " 'Speaker1 indicates a possible intention to clarify or amend a previous message',\n", + " 'Self-correction suggests a desire for accuracy',\n", + " 'There is no indication of disagreement or conflict',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'Speaker1 is possibly confused or insistent on getting an answer',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker3 enters the conversation',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'Speaker1 is possibly confused or insistent on getting an answer',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 initiates contact with a request',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'Speaker1 is possibly confused or insistent on getting an answer',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 seeks advice, indicating some initial discouragement',\n", + " 'Speaker2 repeats their message with minor edits, possibly to correct errors or emphasize a point',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 initiates the conversation with a 
question expressing doubt',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'Speaker1 is possibly confused or insistent on getting an answer',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 initiates the conversation with a statement',\n", + " 'Speaker1 repeats the same statement, expressing a desire for content removal, deeming it irrelevant',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The overall tone is neutral but persistent',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 posts a link',\n", + " 'Speaker2 repeats the same link',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'Speaker1 is possibly confused or insistent on getting an answer',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 presents a series of documents and excerpts.',\n", + " 'Speaker1 appears to be building a case by repeatedly presenting evidence.',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", + " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n", + " 'Speaker1 is possibly confused or insistent on getting an answer',\n", + " 'The overall tone is neutral but persistent',\n", + " 'Speaker1 seeks clarification and justification for a previous action',\n", + " 'Speaker1 intends to understand the seriousness of the action',\n", + " 'Speaker1 requests removal of the action if unwarranted',\n", + " 'Speaker2 avoids direct engagement in the initial discussion',\n", + " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n", + " 'Speaker1 repeats the same information and question multiple times',\n", + " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n", + " 'Speaker1 initiates the conversation with a neutral intention, expressing a desire to contribute to Wikipedia based on 
their professional research skills',\n",
+       " 'Speaker1 initiates the conversation with a question, seemingly seeking input or validation',\n",
+       " 'Speaker1 repeats the same information and question multiple times',\n",
+       " 'The repetition indicates persistence and perhaps a hint of frustration at the lack of response',\n",
+       " 'The repeated questioning suggests Speaker1 is trying to emphasize a point',\n",
+       " 'Speaker1 is possibly confused or insistent on getting an answer',\n",
+       " 'The overall tone is neutral but persistent',\n",
+       " 'Speaker1 reports a potential issue',\n",
+       " 'Speaker2 responds with a resolution',\n",
+       " 'The tone is neutral and transactional',\n",
+       " 'Both speakers are focused on reporting and resolving a technical problem',\n",
+       " 'There is no indication of disagreement, argumentation, or emotional expression',\n",
+       " 'Speaker1 informs Speaker2 of an issue and suggests a solution',\n",
+       " 'Speaker2 responds with gratitude',\n",
+       " 'Speaker1 reports an issue, seemingly through an automated message.',\n",
+       " 'Speaker2 responds with an acknowledgement of completion.',\n",
+       " 'The pattern of Speaker1 reporting an issue and Speaker2 acknowledging completion repeats.',\n",
+       " 'Speaker1 intends to report issues.',\n",
+       " 'Speaker2 intends to confirm resolution.',\n",
+       " 'Speaker1 initiates the conversation by posting a lengthy comment',\n",
+       " 'Another user responds with a request for a summary, indicating the original post was too long',\n",
+       " 'The tone is initially neutral, with a possible hint of passive-aggression in the summary request',\n",
+       " 'Speaker1 politely requests that another user add appropriate licenses to uploaded images.',\n",
+       " 'Speaker1 warns of potential deletion due to copyright concerns.',\n",
+       " 'Speaker1 provides helpful resources.',\n",
+       " 'The overall tone is informative and helpful.',\n",
+       " 'There is a focus on compliance and user support.',\n",
+       " ...]"
+      ]
+     },
+     "execution_count": 52,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cluster1_bulletpoints"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "id": "153f9e35-314d-4a28-8f79-6bafa547b16f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['Speaker1 proposes a change, providing justifications',\n",
+       " 'Speaker2 disagrees, defending the original term and its usage',\n",
+       " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n",
+       " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n",
+       " 'Speaker3 and Speaker2 then align in disagreement with Speaker1',\n",
+       " 'Speaker1 initiates the conversation with a question',\n",
+       " 'Speaker1 challenges a previous statement',\n",
+       " 'Speaker2 offers a defense',\n",
+       " 'Speaker1 rebuts with skepticism',\n",
+       " 'Speaker1 proposes an edit',\n",
+       " 'Speaker3 disagrees',\n",
+       " 'Speaker1 expresses continued skepticism but concedes temporarily',\n",
+       " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n",
+       " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n",
+       " 'Speaker3 welcomes Speaker2 input but defends the current version as a carefully considered compromise',\n",
+       " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n",
+       " 'Speaker1 initiates the conversation with a question, expressing confusion about Speaker2 
actions',\n", + " 'Speaker2 responds defensively, requesting a specific objection and implying Speaker1 is being overly bureaucratic',\n", + " 'Speaker1 then directly questions the purpose of Speaker2 changes, expressing disagreement',\n", + " 'Speaker2 is dismissive',\n", + " 'Speaker2 restates their position, showing persistent disagreement and a lack of willingness to explain their reasoning',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n", + " 'Speaker3 and Speaker2 then align in disagreement with Speaker1',\n", + " 'Speaker1 initiates the conversation with a request for help and expresses confusion',\n", + " 'Speaker2 responds helpfully, offering to mediate a dispute with another user',\n", + " 'Speaker3 joins, providing advice',\n", + " 'Speaker2 disagrees with Speaker3 assessment, expressing strong conviction',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 initiates the conversation with multiple questions expressing confusion and requesting clarification',\n", + " 'Speakers 2 and 3 offer tentative suggestions',\n", + " 'Speaker1 expresses dissatisfaction, emphasizing the lack of clarity and hinting at deletion',\n", + " 'Speaker4 acknowledges the problem and offers a limited solution',\n", + " 'Speaker5 provides a potential answer and identifies errors',\n", + " 'Speaker6 thanks Speaker5, seeks confirmation, and references earlier confusion',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n", + " 'Speaker1 presents a list of accusations.',\n", + " 'Speaker1 refutes each point in the list of accusations.',\n", + " 'The intention of both speakers appears to be defensive.',\n", + " 'Each speaker aims to justify a particular stance.',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 offers specific suggestions and identifies areas needing correction',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n", + " 'Speaker2 responds defensively, offering an explanation and shifting blame to another source',\n", + " 'Speaker2 attempts to further clarify their position by repeating their explanation',\n", + " 'Speaker3 then echoes Speaker1 initial accusation, suggesting continued doubt or disagreement with Speaker2 defense',\n", + " 'The conversation is marked by accusation, defense, and a lack of immediate resolution',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker4 enters, directly attacking Speaker1 reasoning and dismissing their understanding',\n", + " 'Speaker1 states a fact and provides a source',\n", + " 'Speaker2 expresses confusion and requests clarification',\n", + " 'Speaker3 enters the conversation to provide additional information and a source that seemingly resolves the initial uncertainty',\n", + " 'Speaker2 then makes an assumption based on the provided information',\n", + " 
'Speaker4 expresses disagreement with Speaker2 assumption, providing counter-evidence and requesting further verification',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n", + " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n", + " 'Speaker1 initiates the conversation with a question, expressing confusion about Speaker2 categorization.',\n", + " 'Speaker2 responds, seeking clarification and implying the category is a work in progress.',\n", + " 'Speaker1 reiterates their confusion, questioning the criteria used for categorization.',\n", + " 'Speaker1 highlights what they perceive as an inconsistency.',\n", + " 'The tone remains relatively neutral, with both speakers primarily focused on seeking and providing clarification.',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 initiates the conversation with an unclear statement',\n", + " 'Speaker1 expresses confusion and defensiveness',\n", + " 'Speaker1 attempts to justify a previous action',\n", + " 'The exchange then repeats verbatim',\n", + " 'The repetition indicates a possible technical issue or misunderstanding about the conversation flow',\n", + " 'The overall tone is initially defensive',\n", + " 'The tone becomes potentially confused due to the repetition',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n", + " 'Speaker3 and Speaker2 then align in disagreement with Speaker1',\n", + " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n", + " 'Speaker4 enters, directly attacking Speaker1 reasoning and dismissing their understanding',\n", + " 'Speaker5 supports Speaker4 position',\n", + " 'Speaker6 enters much later, acknowledging the complexity and offering a historical explanation, potentially adding to the confusion',\n", + " 'The conversation begins with a suggestion',\n", + " 'Speaker2 expresses disagreement and provides justification based on naming conventions',\n", + " 'Speaker1 offers a rebuttal, citing common usage and questioning Speaker2 reasoning',\n", + " 'Speaker3 enters, reinforcing Speaker2 point by quoting policy and drawing an analogy',\n", + " 'Speaker1 concedes a point but raises new objections',\n", + " 'Years later, Speaker4 expresses confusion and disapproval, accusing others of biased thinking',\n", + " 'Speaker5 defends the prior decision, clarifying its scope',\n", + " 'Speaker4 persists with objections',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker1 initiates the conversation with a concern, expressing frustration about potentially losing his work',\n", + " 'Speaker1 responds defensively',\n", + " 'Speaker1 provides extensive explanations to address Speaker2 confusion',\n", + " 'Speaker1 justifies his actions',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 initiates the conversation with a question, expressing a need for information',\n", + " 
'Speaker2 restates the question',\n", + " 'Speaker2 provides an answer, indicating a helpful intention',\n", + " 'Speaker3 adds further information, supplementing the previous answer and demonstrating a collaborative and informative intention',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker3 and Speaker2 then align in disagreement with Speaker1',\n", + " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n", + " 'Speaker5 supports Speaker4 position',\n", + " 'Multiple users express similar opinions',\n", + " 'Speaker2 and Speaker3 agree on an initial point',\n", + " 'Speaker4 enters the conversation with disagreement',\n", + " 'There is a back-and-forth rebuttal between Speaker4 and Speaker3',\n", + " 'Speaker4 defends their position',\n", + " 'Speaker6 joins, adding more arguments',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker3 and Speaker2 then align in disagreement with Speaker1',\n", + " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n", + " 'Speaker4 enters, directly attacking Speaker1 reasoning and dismissing their understanding',\n", + " 'Speaker2 offers an unsolicited opinion, expressing disagreement and identifying unreliable sources',\n", + " 'Speaker3 enters the conversation with disagreement, employing rhetorical questions and sarcasm',\n", + " 'Speaker4 agrees with Speaker1, suggesting a solution while criticizing Speaker3',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n", + " 'Speaker1 initiates the conversation with a direct question, seeking clarification for a reverted change',\n", + " 'Speaker2 responds politely, offering an assumption as justification',\n", + " 'Speaker1 expresses disagreement, providing a detailed explanation and pointing out inconsistencies, using a slightly sarcastic tone',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n", + " 'Speaker3 and Speaker2 then align in disagreement with Speaker1',\n", + " 'Speaker4 enters, directly attacking Speaker1 reasoning and dismissing their understanding',\n", + " 'Speaker5 supports Speaker4 position',\n", + " 'The conversation begins with a request for clarification',\n", + " 'An explanation is provided',\n", + " 'There is initial disagreement regarding visual presentation',\n", + " 'Speaker2 defends their design choice',\n", + " 'Speaker2 concedes to a minor adjustment',\n", + " 'Speaker4 expresses strong disapproval',\n", + " 'Speaker4 states their intention to remove the element in question',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker1 points out an inconsistency',\n", + " 'Speaker1 provides supporting details',\n", + " 'Speaker2 repeats Speaker1 statement',\n", + " 'Speaker3 acknowledges the confusion',\n", + " 'Speaker3 validates Speaker1 claim with external evidence',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its 
usage',\n", + " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n", + " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n", + " 'Speaker1 initiates the conversation by disagreeing with Speaker2 removal of a sentence',\n", + " 'Speaker2 rebuts with a detailed explanation, questioning the validity of Speaker1 sources and accusing them of presenting a biased viewpoint',\n", + " 'Speaker1 defends their sources, selectively quoting them to support their claims, while also downplaying opposing viewpoints',\n", + " 'The conversation involves persistent disagreement and defense of positions',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker2 responds defensively, disagreeing with the accusation',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n", + " 'Speaker1 initiates the conversation by disagreeing with a stated fact',\n", + " 'Speaker2 then politely requests evidence to support Speaker1 claim',\n", + " 'Speaker1 responds by providing a quote from a publicly available source, intending to clarify the discrepancy and justify their initial disagreement',\n", + " 'The tone remains civil and informative throughout the exchange, with both speakers demonstrating a willingness to engage constructively',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 initiates the conversation with polite disagreement',\n", + " 'Speaker2 asserts their original intention',\n", + " 'Speaker1 then repeats their initial message',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker3 and Speaker2 then align in disagreement with Speaker1',\n", + " 'Speaker4 enters, directly attacking Speaker1 reasoning and dismissing their understanding',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker2 responds assertively, disagreeing with Speaker1',\n", + " 'Speaker2 reverts Speaker1 changes',\n", + " 'Speaker3 supports Speaker2 position',\n", + " 'Speaker4 attempts to de-escalate the situation',\n", + " 'Speaker4 suggests Speaker1 actions stem from a lack of awareness',\n", + " 'Speaker4 downplays the significance of the disagreement',\n", + " 'Speaker2 repeats their earlier statements, reinforcing their stance',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker3 and Speaker2 then align in disagreement with Speaker1',\n", + " 'Speaker1 initiates the conversation with a concerned and expectant tone',\n", + " 'Speaker2 responds with factual information, seemingly in agreement with the initial concern',\n", + " 'Speaker2 then reiterates the same information',\n", + " 'Speaker3 enters the conversation, providing additional details and context in a neutral tone',\n", + " 'Speaker2 adds a related piece of information, building upon Speaker3 contribution',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 poses a question and provides data, seemingly seeking confirmation or 
explanation',\n", + " 'Speaker2 responds with a terse, questioning calculation, implying disagreement or confusion regarding Speaker1 initial calculation',\n", + " 'The tone is neutral, but Speaker2 response hints at skepticism or a challenge to Speaker1 understanding',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n", + " 'Speaker1 initiates the conversation with an inquiry, expressing confusion and requesting verification',\n", + " 'Speaker2 responds with a detailed explanation, conceding an initial inaccuracy but defending their overall point with supporting evidence',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 politely suggests improvements',\n", + " 'Speaker2 responds defensively, justifying their actions',\n", + " 'Speaker3 enters, expressing disagreement with Speaker2s information',\n", + " 'Speaker2 initially defends their position',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker3 and Speaker2 then align in disagreement with Speaker1',\n", + " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n", + " 'Speaker4 enters, directly attacking Speaker1 reasoning and dismissing their understanding',\n", + " 'Speaker5 supports Speaker4 position',\n", + " 'Speaker6 enters much later, acknowledging the complexity and offering a historical explanation, potentially adding to the confusion',\n", + " 'The conversation begins with a request for modification',\n", + " 'A disagreement and defense of an initial claim occurs',\n", + " 'Speaker1 challenges the basis of the claim',\n", + " 'Speaker2 provides further justification',\n", + " 'Speaker4 enters, offering evidence to support the initial claim',\n", + " 'Speaker2 continues to dispute the initial claim',\n", + " 'Speaker3 and Speaker4 provide additional sources and examples',\n", + " 'Speaker2 remains unconvinced',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n", + " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n", + " 'Speaker1 initiates the conversation with a question that implies potential frustration or sarcasm',\n", + " 'Speaker2 responds by referencing a previous discussion and expressing agreement with a particular viewpoint',\n", + " 'Speaker1 then expresses disagreement and confusion, questioning Speaker2 action and stating a lack of supporting evidence',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n", + " 'Speaker2 responds with a neutral sentiment, acknowledging Speaker1 point but also suggesting a more composed reaction',\n", + " 'Speaker1 then concedes to Speaker2 point, while also defending their initial motivation and expressing frustration that their attempt backfired',\n", + " 'Speaker1 proposes a change, providing 
justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker2 interjects with a clarification.',\n", + " 'Speaker2 seems to be defending Wst.',\n", + " 'Speaker2 disputes the legitimacy of the ban.',\n", + " 'Speaker2 intention appears to be a rebuttal of Speaker1 justification for the ban.',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker3 and Speaker2 then align in disagreement with Speaker1',\n", + " 'Speaker1 initiates the conversation with a defensive tone, asserting the superiority of their contribution',\n", + " 'Speaker2 responds with disagreement, employing a passive-aggressive approach by questioning the fairness of the situation and implying a conflict of interest',\n", + " 'Speaker3 then enters, offering a more neutral and balanced perspective, acknowledging both sides strengths and weaknesses',\n", + " 'Speaker4 joins, providing a comparative analysis and ultimately supporting Speaker1 contribution with a justification for the change',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n", + " 'Speaker3 and Speaker2 then align in disagreement with Speaker1',\n", + " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n", + " 'Speaker1 initiates the conversation with a request',\n", + " 'Speaker2 responds with disagreement, citing policy',\n", + " 'Speaker3 expresses confusion and disagreement with Speaker2 stance, using rhetorical questions',\n", + " 'Speaker2 defends their position by quoting policy',\n", + " 'Speaker2 remains firm, questioning Speaker3 credibility',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n", + " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n", + " 'Speaker1 initiates the conversation with an inquiry, expressing initial uncertainty',\n", + " 'Speaker2 responds with a hypothetical scenario, seeking clarification from Speaker1',\n", + " 'Speaker1 then provides additional information to support the discussion',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker4 enters, directly attacking Speaker1 reasoning and dismissing their understanding',\n", + " 'Speaker1 questions the necessity of a statement',\n", + " 'Speaker2 introduces a related point, expressing a sense of unfairness',\n", + " 'Speaker3 responds by requesting a verifiable source, indicating skepticism',\n", + " 'Speaker4 offers a potential source but expresses uncertainty',\n", + " 'Speaker5 enters the conversation to disagree with Speaker2 initial point, asserting a counter-argument with conviction',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker3 and Speaker2 then align in disagreement with Speaker1',\n", + " 'Speaker1 makes an initial assertion',\n", + " 'Speaker2 expresses a suspicion about another user intention',\n", + " 
'Speaker3 offers a rebuttal, attempting to clarify the other user position and suggesting alternative perspectives',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n", + " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n", + " 'Speaker1 initiates the conversation with a question, seeking clarification',\n", + " 'Speaker2 responds with a correction and justification, expressing certainty',\n", + " 'Speaker1 then questions the calculation, indicating confusion but also acknowledging a potential oversight',\n", + " 'Speaker3 enters the conversation to offer support for Speaker1 initial point, providing a detailed explanation',\n", + " 'Speaker1 then integrates the information, expressing uncertainty about the original statement',\n", + " 'Speaker3 offers a revised perspective',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n", + " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n", + " 'Speaker1 initiates the conversation with disagreement and expresses frustration regarding edits made by Speaker2',\n", + " 'Speaker1 defends their original writing, implying Speaker2 lacks specific knowledge',\n", + " 'Speaker2 responds politely, offering an alternative perspective and suggesting a different course of action',\n", + " 'Speaker1 persists in their disagreement, providing historical context to justify their original content and expressing displeasure with further edits',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker3 and Speaker2 then align in disagreement with Speaker1',\n", + " 'Speaker1 provides information',\n", + " 'Speaker2 challenges Speaker1 information with updated details',\n", + " 'Speaker2 defends their contribution, arguing it is a necessary correction to outdated information and not an advertisement',\n", + " 'Speaker3 enters the conversation, disagreeing with Speaker2 inclusion of a source',\n", + " 'Speaker3 provides counter-evidence to Speaker2 claims',\n", + " 'Speaker3 tone is critical, pointing out inconsistencies and offering alternative information',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n", + " 'Speaker1 and Speaker2 began with disagreement',\n", + " 'Speaker2 responded defensively, intending to clarify their additions and emphasizing the importance of their sources',\n", + " 'Speaker1 rebutted, maintaining a critical stance and questioning the validity of Speaker2 sources',\n", + " 'Speaker1 denied censorship, attempting to clarify their reasoning',\n", + " 'Speaker1 then shifted to a more explanatory tone, pointing to other relevant information',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker3 and Speaker2 then align in disagreement with Speaker1',\n", + " 
'Speaker1 initiates the conversation with a question, expressing confusion',\n", + " 'Speaker2 repeats Speaker1 statement',\n", + " 'Speaker3 offers a potential explanation in a neutral tone, while also expressing some uncertainty',\n", + " 'Speaker4 provides a more detailed explanation, seemingly disagreeing with the initial confusion',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 persists in their viewpoint, attempting to refute the others claims',\n", + " 'Speaker1 initiates the conversation with a direct correction',\n", + " 'Speaker2 perceives Speaker1s correction as impolite',\n", + " 'Speaker2 immediately rebuts Speaker1s assertion, disagreeing with their point',\n", + " 'Speaker2 repeats their rebuttal multiple times',\n", + " 'Speaker1 reiterates their original point with persistent disagreement',\n", + " 'The tone shifts from correction to defensive disagreement and persistent contradiction',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker3 and Speaker2 then align in disagreement with Speaker1',\n", + " 'Speaker1 initiates the conversation with a question, expressing uncertainty and seeking clarification',\n", + " 'Speaker2 responds by offering two possible approaches, suggesting a preference for a more practical method',\n", + " 'Speaker3 generally agrees with one of the approaches, providing additional information and identifying inconsistencies',\n", + " 'Speaker4 expresses gratitude and agreement, offering additional suggestions and information',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker2 responds defensively, expressing feeling unfairly treated',\n", + " 'Speaker2 attempts to justify their actions by appealing to shared understanding and past experiences',\n", + " 'Speaker1 proposes a change, providing justifications',\n", + " 'Speaker2 disagrees, defending the original term and its usage',\n", + " 'Speaker1 concedes slightly, offering a compromise while still advocating for their preferred term, providing supporting evidence',\n", + " 'Speaker1 expresses initial disbelief and seeks clarification',\n", + " 'Speaker2 responds by providing additional information and context to explain the initial point of confusion',\n", + " 'Speaker2 agrees with the need for clarity but raises a potential misinterpretation',\n", + " 'Speaker2 expresses agreement, leading to a resolution',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker3 welcomes Speaker2 input but defends the current version as a carefully considered compromise',\n", + " 'Speaker3 accuses another user of disregarding established protocols and imposing their view unilaterally',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker3 uses sarcasm',\n", + " 'Speaker1 initiates the conversation with a critical tone',\n", + " 'Speaker1 questions the introduction',\n", + " 'Speaker1 implies a deviation from 
scientific accuracy',\n", + " 'Speaker1 expresses a desire for factual representation',\n", + " 'Speaker2 responds defensively',\n", + " 'Speaker2 employs a sarcastic remark',\n", + " 'Speaker2 shifts from a neutral disagreement to a slightly condescending stance',\n", + " 'Speaker2 adopts a subtly attacking stance',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker1 initiates the conversation with a question, expressing uncertainty',\n", + " 'Speaker2 responds with confirmation and clarification, adopting a slightly instructive tone',\n", + " 'Speaker3 then offers a correction and provides additional context, seemingly disagreeing with Speaker2 initial response',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 makes a suggestion.',\n", + " 'Speaker3 enters the conversation and proposes an alternative.',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 raises a potential issue, pointing out a possible error',\n", + " 'Speaker1 reiterates the same concern regarding another instance',\n", + " 'Speaker2 questions the existence of the problem, offering a counter-explanation',\n", + " 'Speaker3 intervenes, agreeing with Speaker1 implied concern about a citation error',\n", + " 'Speaker3 provides evidence to support Speaker1 concern',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 initiates the conversation with a question, expressing concern and disbelief',\n", + " 'Speaker2 responds with additional information and context',\n", + " 'Speaker2 introduces a potentially controversial viewpoint',\n", + " 'Speaker3 enters the conversation to strongly disagree with Speaker2',\n", + " 'Speaker3 accuses Speaker2 of expressing personal opinions and lacking evidence',\n", + " 'Speaker2 defends their position',\n", + " 'The tone becomes defensive and accusatory',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 expresses continued skepticism but concedes temporarily',\n", + " 'Speaker1 initiates the conversation with a suggestion',\n", + " 'Speaker2 politely disagrees, providing a rationale for their preferred approach',\n", + " 'Speaker3 then enters, pointing out a potential flaw in Speaker2s reasoning',\n", + " 'Speaker2 concedes the point and expresses willingness to compromise',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 defends themselves against perceived accusations',\n", + " 'Speaker1 reiterates their intention for accuracy',\n", + " 'Speaker3 enters, expressing strong disagreement',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts 
with skepticism',\n", + " 'Speaker3 disagrees',\n", + " 'The conversation begins with a question',\n", + " 'There is a disagreement',\n", + " 'Speaker1 defends their position by providing links',\n", + " 'Speaker1 attempts a rebuttal',\n", + " 'Speaker2 re-enters to offer a concession',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 initiates the conversation with a question, expressing confusion about Speaker2 actions',\n", + " 'Speaker1 then directly questions the purpose of Speaker2 changes, expressing disagreement',\n", + " 'Speaker2 is dismissive',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 initiates the conversation with a request for help and expresses confusion',\n", + " 'Speaker2 responds helpfully, offering to mediate a dispute with another user',\n", + " 'Speaker3 joins, providing advice',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 initiates the conversation with multiple questions expressing confusion and requesting clarification',\n", + " 'Speakers 2 and 3 offer tentative suggestions',\n", + " 'Speaker1 expresses dissatisfaction, emphasizing the lack of clarity and hinting at deletion',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 proposes an edit',\n", + " 'The intention of both speakers appears to be defensive.',\n", + " 'Each speaker aims to justify a particular stance.',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker1 defends themselves against perceived accusations',\n", + " 'Speaker1 initiates the conversation by re-posting a request for feedback on a biography',\n", + " 'Speaker2 offers specific suggestions and identifies areas needing correction',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 defends themselves against perceived accusations',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, questioning potential plagiarism',\n", + " 'Speaker2 responds defensively, offering an explanation and shifting blame to another source',\n", + " 'Speaker2 attempts to further clarify their position by repeating their explanation',\n", + " 'Speaker3 then echoes Speaker1 initial accusation, suggesting continued doubt or disagreement with Speaker2 defense',\n", + " 'The conversation is marked by accusation, defense, and a lack of immediate resolution',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 states a fact and provides a source',\n", + " 'Speaker2 expresses confusion and requests clarification',\n", + " 'Speaker3 enters the conversation to provide additional information and a source that seemingly resolves the initial uncertainty',\n", + 
" 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 initiates the conversation with a question, expressing confusion about Speaker2 categorization.',\n", + " 'Speaker2 responds, seeking clarification and implying the category is a work in progress.',\n", + " 'Speaker1 reiterates their confusion, questioning the criteria used for categorization.',\n", + " 'Speaker1 highlights what they perceive as an inconsistency.',\n", + " 'The tone remains relatively neutral, with both speakers primarily focused on seeking and providing clarification.',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 defends themselves against perceived accusations',\n", + " 'Speaker1 reiterates their intention for accuracy',\n", + " 'Speaker1 initiates the conversation with an unclear statement',\n", + " 'Speaker1 expresses confusion and defensiveness',\n", + " 'Speaker1 attempts to justify a previous action',\n", + " 'Speaker1 appeals to a guideline',\n", + " 'The overall tone is initially defensive',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 expresses continued skepticism but concedes temporarily',\n", + " 'Speaker3 uses sarcasm',\n", + " 'The conversation begins with a suggestion',\n", + " 'Speaker2 expresses disagreement and provides justification based on naming conventions',\n", + " 'Speaker2 acts decisively',\n", + " 'Speaker1 offers a rebuttal, citing common usage and questioning Speaker2 reasoning',\n", + " 'Speaker1 concedes a point but raises new objections',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 defends themselves against perceived accusations',\n", + " 'Speaker1 responds defensively',\n", + " 'Speaker1 corrects a minor detail',\n", + " 'Speaker1 expresses feeling targeted',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 initiates the conversation with a question, expressing a need for information',\n", + " 'Speaker2 restates the question',\n", + " 'Speaker2 provides an answer, indicating a helpful intention',\n", + " 'Speaker1 responds with gratitude, showing appreciation for the information provided',\n", + " 'Speaker3 adds further information, supplementing the previous answer and demonstrating a collaborative and informative intention',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 expresses continued skepticism but concedes temporarily',\n", + " 'Speaker2 and Speaker3 agree on an initial point',\n", + " 'Speaker4 enters the conversation with disagreement',\n", + " 'Speaker4 defends their position',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 expresses continued skepticism but concedes temporarily',\n", + " 
'Speaker3 uses sarcasm',\n", + " 'Speaker2 offers an unsolicited opinion, expressing disagreement and identifying unreliable sources',\n", + " 'Speaker3 enters the conversation with disagreement, employing rhetorical questions and sarcasm',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker1 initiates the conversation with a direct question, seeking clarification for a reverted change',\n", + " 'Speaker2 responds politely, offering an assumption as justification',\n", + " 'Speaker1 expresses disagreement, providing a detailed explanation and pointing out inconsistencies, using a slightly sarcastic tone',\n", + " 'Speaker1 states they have reverted the change again, indicating persistent disagreement and a unilateral action',\n", + " 'The conversation appears unresolved, with Speaker2 not responding to the detailed explanation',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'There is a period of silence',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 expresses continued skepticism but concedes temporarily',\n", + " 'Speaker1 defends themselves against perceived accusations',\n", + " 'Speaker1 reiterates their intention for accuracy',\n", + " 'The conversation begins with a request for clarification',\n", + " 'An explanation is provided',\n", + " 'There is initial disagreement regarding visual presentation',\n", + " 'Speaker2 defends their design choice',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker3 uses sarcasm',\n", + " 'Speaker1 points out an inconsistency',\n", + " 'Speaker1 provides supporting details',\n", + " 'Speaker2 repeats Speaker1 statement',\n", + " 'Speaker3 validates Speaker1 claim with external evidence',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker1 expresses continued skepticism but concedes temporarily',\n", + " 'Speaker1 defends themselves against perceived accusations',\n", + " 'Speaker1 reiterates their intention for accuracy',\n", + " 'Speaker1 initiates the conversation by disagreeing with Speaker2 removal of a sentence',\n", + " 'Speaker2 accuses Speaker1 of using a common tactic to discredit critics',\n", + " 'Speaker1 concedes on one point but maintains the importance of presenting facts',\n", + " 'The conversation involves persistent disagreement and defense of positions',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 defends themselves against perceived accusations',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 initiates the conversation by disagreeing with a stated fact',\n", + " 'Speaker2 then politely requests evidence to support Speaker1 claim',\n", + " 'The tone remains civil and informative throughout 
the exchange, with both speakers demonstrating a willingness to engage constructively',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker1 defends themselves against perceived accusations',\n", + " 'Speaker1 reiterates their intention for accuracy',\n", + " 'Speaker1 initiates the conversation with polite disagreement',\n", + " 'Speaker2 points to past criticisms of Speaker1 by others',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker2 responds assertively, disagreeing with Speaker1',\n", + " 'Speaker2 reverts Speaker1 changes',\n", + " 'Speaker3 supports Speaker2 position',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 initiates the conversation with a concerned and expectant tone',\n", + " 'Speaker2 responds with factual information, seemingly in agreement with the initial concern',\n", + " 'Speaker3 enters the conversation, providing additional details and context in a neutral tone',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker1 poses a question and provides data, seemingly seeking confirmation or explanation',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker1 expresses continued skepticism but concedes temporarily',\n", + " 'Speaker1 reiterates their intention for accuracy',\n", + " 'Speaker1 initiates the conversation with an inquiry, expressing confusion and requesting verification',\n", + " 'After a period of silence, Speaker1 reiterates their request',\n", + " 'Speaker2 responds with a detailed explanation, conceding an initial inaccuracy but defending their overall point with supporting evidence',\n", + " 'Speaker1 acknowledges their limited expertise, highlighting points of uncertainty and suggesting a compromise',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 expresses continued skepticism but concedes temporarily',\n", + " 'Speaker1 defends themselves against perceived accusations',\n", + " 'Speaker1 politely suggests improvements',\n", + " 'Speaker2 responds defensively, justifying their actions',\n", + " 'Speaker3 enters, expressing disagreement with Speaker2s information',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 defends themselves against perceived accusations',\n", + " 'Speaker1 reiterates their intention for accuracy',\n", + " 'The conversation begins with a request for modification',\n", + " 'A disagreement and defense of an initial claim occurs',\n", + " 'Speaker1 challenges 
the basis of the claim',\n", + " 'Speaker2 provides further justification',\n", + " 'Speaker4 enters, offering evidence to support the initial claim',\n", + " 'Speaker2 continues to dispute the initial claim',\n", + " 'Speaker2 remains unconvinced',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'There is a period of silence',\n", + " 'Speaker1 initiates the conversation with a question that implies potential frustration or sarcasm',\n", + " 'Speaker2 responds by referencing a previous discussion and expressing agreement with a particular viewpoint',\n", + " 'Speaker1 then expresses disagreement and confusion, questioning Speaker2 action and stating a lack of supporting evidence',\n", + " 'Speaker2 then concedes with a brief acknowledgement',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 defends themselves against perceived accusations',\n", + " 'Speaker1 initiates the conversation with a sarcastic tone, questioning another user actions',\n", + " 'Speaker2 responds with a neutral sentiment, acknowledging Speaker1 point but also suggesting a more composed reaction',\n", + " 'Speaker1 then concedes to Speaker2 point, while also defending their initial motivation and expressing frustration that their attempt backfired',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 initiates the conversation with a defensive tone, asserting the superiority of their contribution',\n", + " 'Speaker2 responds with disagreement, employing a passive-aggressive approach by questioning the fairness of the situation and implying a conflict of interest',\n", + " 'Speaker3 then enters, offering a more neutral and balanced perspective, acknowledging both sides strengths and weaknesses',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker1 expresses continued skepticism but concedes temporarily',\n", + " 'Speaker3 uses sarcasm',\n", + " 'Speaker1 defends themselves against perceived accusations',\n", + " 'Speaker1 initiates the conversation with a request',\n", + " 'Speaker2 responds with disagreement, citing policy',\n", + " 'Speaker3 expresses confusion and disagreement with Speaker2 stance, using rhetorical questions',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker1 reiterates their intention for accuracy',\n", + " 'Speaker1 initiates the conversation with an inquiry, expressing initial uncertainty',\n", + " 'Speaker2 responds with a hypothetical scenario, seeking clarification from Speaker1',\n", + " 'Speaker1 acknowledges an error and expresses agreement with Speaker2 suggestion',\n", + " 'Speaker1 then provides 
additional information to support the discussion',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 questions the necessity of a statement',\n", + " 'Speaker2 introduces a related point, expressing a sense of unfairness',\n", + " 'Speaker5 enters the conversation to disagree with Speaker2 initial point, asserting a counter-argument with conviction',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 makes an initial assertion',\n", + " 'Speaker2 expresses a suspicion about another user intention',\n", + " 'Speaker3 offers a rebuttal, attempting to clarify the other user position and suggesting alternative perspectives',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 expresses continued skepticism but concedes temporarily',\n", + " 'Speaker1 reiterates their intention for accuracy',\n", + " 'Speaker1 initiates the conversation with a question, seeking clarification',\n", + " 'Speaker2 responds with a correction and justification, expressing certainty',\n", + " 'Speaker1 then questions the calculation, indicating confusion but also acknowledging a potential oversight',\n", + " 'Speaker3 enters the conversation to offer support for Speaker1 initial point, providing a detailed explanation',\n", + " 'Speaker1 then integrates the information, expressing uncertainty about the original statement',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 defends themselves against perceived accusations',\n", + " 'Speaker1 reiterates their intention for accuracy',\n", + " 'Speaker1 initiates the conversation with disagreement and expresses frustration regarding edits made by Speaker2',\n", + " 'Speaker1 defends their original writing, implying Speaker2 lacks specific knowledge',\n", + " 'Speaker2 responds politely, offering an alternative perspective and suggesting a different course of action',\n", + " 'Speaker1 persists in their disagreement, providing historical context to justify their original content and expressing displeasure with further edits',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 provides information',\n", + " 'Speaker2 challenges Speaker1 information with updated details',\n", + " 'Speaker3 enters the conversation, disagreeing with Speaker2 inclusion of a source',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker1 and Speaker2 began with disagreement',\n", + " 'Speaker1 expressed a dismissive tone towards Speaker2 source',\n", + " 'Speaker2 responded defensively, intending to clarify their additions and emphasizing the importance of their sources',\n", + " 'Speaker1 rebutted, maintaining a critical stance and questioning the validity of Speaker2 sources',\n", + " 'Speaker2 persistently disagreed, expressing confusion 
and accusing Speaker1 of censorship',\n", + " 'Speaker1 denied censorship, attempting to clarify their reasoning',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker1 initiates the conversation with a question, expressing confusion',\n", + " 'Speaker2 repeats Speaker1 statement',\n", + " 'Speaker3 offers a potential explanation in a neutral tone, while also expressing some uncertainty',\n", + " 'Speaker3 then adds further information and shifts to a slightly accusatory tone, suggesting Speaker2 should moderate their tone based on a previous discussion',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 reiterates their intention for accuracy',\n", + " 'Speaker1 initiates the conversation with a direct correction',\n", + " 'Speaker2 immediately rebuts Speaker1s assertion, disagreeing with their point',\n", + " 'Speaker1 reiterates their original point with persistent disagreement',\n", + " 'The tone shifts from correction to defensive disagreement and persistent contradiction',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 initiates the conversation with a question, expressing uncertainty and seeking clarification',\n", + " 'Speaker2 responds by offering two possible approaches, suggesting a preference for a more practical method',\n", + " 'Speaker3 generally agrees with one of the approaches, providing additional information and identifying inconsistencies',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker1 expresses continued skepticism but concedes temporarily',\n", + " 'Speaker1 initiates the conversation with an accusatory and frustrated tone, criticizing Speaker2 actions',\n", + " 'Speaker2 responds defensively, expressing feeling unfairly treated',\n", + " 'Speaker2 attempts to justify their actions by appealing to shared understanding and past experiences',\n", + " 'Speaker1 initiates the conversation with a question',\n", + " 'Speaker1 challenges a previous statement',\n", + " 'Speaker2 offers a defense',\n", + " 'Speaker1 rebuts with skepticism',\n", + " 'Speaker1 proposes an edit',\n", + " 'Speaker3 disagrees',\n", + " 'Speaker1 expresses continued skepticism but concedes temporarily',\n", + " 'Speaker1 reiterates their intention for accuracy',\n", + " 'Speaker1 expresses initial disbelief and seeks clarification',\n", + " 'Speaker2 responds by providing additional information and context to explain the initial point of confusion',\n", + " 'Speaker1 acknowledges the new information but suggests simplifying the language for better understanding',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'Speaker3 welcomes Speaker2 input but defends the current version as a carefully considered compromise',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker2 politely disagrees with Speaker1, 
referencing guidelines.',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker1 raises a potential issue, pointing out a possible error',\n", + " 'Speaker1 reiterates the same concern regarding another instance',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'Speaker3 welcomes Speaker2 input but defends the current version as a carefully considered compromise',\n", + " 'Speaker3 accuses another user of disregarding established protocols and imposing their view unilaterally',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker1 initiates the conversation with a question, expressing concern and disbelief',\n", + " 'Speaker2 responds with additional information and context',\n", + " 'Speaker2 introduces a potentially controversial viewpoint',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'Speaker3 welcomes Speaker2 input but defends the current version as a carefully considered compromise',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker1 initiates the conversation with a suggestion',\n", + " 'Speaker2 politely disagrees, providing a rationale for their preferred approach',\n", + " 'Speaker3 then enters, pointing out a potential flaw in Speaker2s reasoning',\n", + " 'Speaker3 offers a further suggestion',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'Speaker3 welcomes Speaker2 input but defends the current version as a carefully considered compromise',\n", + " 'Speaker3 accuses another user of disregarding established protocols and imposing their view unilaterally',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'The conversation begins with a request for contact',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'There is a disagreement',\n", + " 'Speaker2 re-enters to offer a concession',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'Speaker3 welcomes Speaker2 input but defends the current version as a carefully considered compromise',\n", + " 
'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker1 initiates the conversation with a request for help and expresses confusion',\n", + " 'Speaker2 responds helpfully, offering to mediate a dispute with another user',\n", + " 'Speaker3 joins, providing advice',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker1 initiates the conversation with multiple questions expressing confusion and requesting clarification',\n", + " 'Speakers 2 and 3 offer tentative suggestions',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker2 offers specific suggestions and identifies areas needing correction',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, questioning potential plagiarism',\n", + " 'Speaker2 responds defensively, offering an explanation and shifting blame to another source',\n", + " 'Speaker2 attempts to further clarify their position by repeating their explanation',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker1 states a fact and provides a source',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'Speaker1 initiates the conversation with an unclear statement',\n", + " 'Speaker1 expresses confusion and defensiveness',\n", + " 'Speaker2 responds with agreement',\n", + " 'The exchange then repeats verbatim',\n", + " 'The repetition indicates a possible technical issue or misunderstanding about the conversation flow',\n", + " 'The overall tone is initially defensive',\n", + " 'The tone shifts to supportive',\n", + " 'The tone becomes potentially confused due to the repetition',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'Speaker3 welcomes Speaker2 input but defends the current version as a carefully considered compromise',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, 
suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'Speaker3 welcomes Speaker2 input but defends the current version as a carefully considered compromise',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker1 initiates the conversation with a concern, expressing frustration about potentially losing his work',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'Speaker3 welcomes Speaker2 input but defends the current version as a carefully considered compromise',\n", + " 'Speaker3 accuses another user of disregarding established protocols and imposing their view unilaterally',\n", + " 'Speaker3 then announces the relocation of the discussion to a different forum',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker2 and Speaker3 agree on an initial point',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'Speaker3 welcomes Speaker2 input but defends the current version as a carefully considered compromise',\n", + " 'Speaker3 accuses another user of disregarding established protocols and imposing their view unilaterally',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker1 initiates the conversation with an accusation and a threat of reporting',\n", + " 'Speaker2 offers an unsolicited opinion, expressing disagreement and identifying unreliable sources',\n", + " 'Speaker3 enters the conversation with disagreement, employing rhetorical questions and sarcasm',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'Speaker3 welcomes Speaker2 input but defends the current version as a carefully considered compromise',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker1 initiates the conversation with a direct question, seeking clarification for a reverted change',\n", + " 'Speaker2 responds politely, offering an assumption as justification',\n", + " 'Speaker1 states they have reverted the change again, indicating persistent disagreement and a unilateral action',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'Speaker3 welcomes Speaker2 input but defends the current version as a carefully considered compromise',\n", + " 'Speaker3 accuses another user of disregarding established protocols and imposing their view unilaterally',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'The conversation begins with a request for clarification',\n", + " 'An explanation is provided',\n", + " 'Speaker2 defends their design choice',\n", + " 'The conversation appears to be an attempt to resolve a content dispute 
through collaborative editing',\n", + " 'Speaker1 points out an inconsistency',\n", + " 'Speaker1 provides supporting details',\n", + " 'Speaker3 acknowledges the confusion',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker1 initiates the conversation by disagreeing with Speaker2 removal of a sentence',\n", + " 'Speaker2 rebuts with a detailed explanation, questioning the validity of Speaker1 sources and accusing them of presenting a biased viewpoint',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'Speaker2 responds politely, expressing reluctance to interfere but offering a suggestion for a more neutral phrasing',\n", + " 'Speaker1 informs Speaker2 of a sanction',\n", + " 'Speaker2 responds defensively, disagreeing with the accusation',\n", + " 'Speaker1 initiates the conversation by disagreeing with a stated fact',\n", + " 'Speaker1 initiates the conversation with an accusatory tone, suggesting an edit war',\n", + " 'The conversation appears to be an attempt to resolve a content dispute through collaborative editing',\n", + " 'Speaker1 initiates the conversation with polite disagreement',\n", + " ...]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster2_bulletpoints" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/convokit/convo_similarity/examples/example.ipynb b/convokit/convo_similarity/examples/example.ipynb new file mode 100644 index 00000000..a049999c --- /dev/null +++ b/convokit/convo_similarity/examples/example.ipynb @@ -0,0 +1,329 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example for using SCD Transformer and Compute with ConDynS\n", + "\n", + "We demonstrate here how to use SCD Transformer for writing SCDs with your custom prompts. Then, we show how to compute ConDynS." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-10-02 03:47:14.871250: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2025-10-02 03:47:14.950271: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2025-10-02 03:47:16.470796: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. 
You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n" + ] + } + ], + "source": [ + "import os\n", + "from convokit import Corpus, download\n", + "from convokit.convo_similarity import SCD\n", + "from convokit.convo_similarity.condyns import ConDynS\n", + "from convokit.genai import GenAIConfigManager" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset already exists at /reef/kz88/convokit/download_corpus/friends-corpus\n" + ] + } + ], + "source": [ + "corpus = Corpus(filename=download(\"friends-corpus\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Write SCD and SoP with SCD Transformer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "### Config your GenAI API keys\n", + "config = GenAIConfigManager()\n", + "\n", + "# Set up Google Cloud configuration for Gemini (with Vertex AI)\n", + "# MODEL_PROVIDER = \"gemini\"\n", + "# MODEL = \"gemini-2.0-flash-001\"\n", + "# config.set_google_cloud_config(\"YOUR PROJECT\", \"YOUR LOCATION\")\n", + "\n", + "# Set up GPT configuration\n", + "MODEL_PROVIDER = \"gpt\"\n", + "MODEL = \"gpt-4o-mini\"\n", + "config.set_api_key(\"gpt\", \"YOUR API KEY\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "### Define your own formatter function for your data\n", + "def format_friends_transcript_from_convokit(convo):\n", + " utt_lst = convo.get_utterance_ids()\n", + " speaker_ids = {}\n", + " transcript = \"\"\n", + " for utt_id in utt_lst:\n", + " utt = corpus.get_utterance(utt_id)\n", + " if \"TRANSCRIPT_NOTE\" not in utt.speaker.id:\n", + " if utt.speaker.id not in speaker_ids:\n", + " speaker_ids[utt.speaker.id] = 1 + len(speaker_ids)\n", + " transcript += \"Speaker\"+str(speaker_ids[utt.speaker.id]) + \" : \" + utt.text+ \"\\n\\n\"\n", + " return transcript" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "### Prepare your own prompt for writing the SCD with your data\n", + "friends_summary_prompt = \"\"\"\n", + "Write a short summary capturing the trajectory of a casual conversation. \n", + "Do not include specific topics, events, or arguments from the conversation. The style you should avoid is illustrated in \n", + "Example Sentence 1: “Speaker1 said they had a difficult day at work, and mentioned that their boss was unfair. Speaker2 listened and agreed that bosses can be tough, then suggested they go out for dinner to forget about it..” Instead, you should include indicators of sentiments (e.g., warmth, empathy, humor, nostalgia, vulnerability, support), individual intentions (e.g., building rapport, offering reassurance, seeking validation, self-disclosure, active listening, gentle disagreement, creating distance), and conversational strategies (if any) such as “collaborative storytelling,” “inside jokes,” “mirroring emotions,” and “affectionate teasing.” \n", + "The following sentences demonstrate the style you should follow: \n", + "Example Sentence 2: “Both speakers have similar feelings and appeared mutually supportive. Speaker1 initiates with a moment of self-disclosure, and Speaker2 responds with empathy and validation. 
Both speakers build on this exchange, strengthening their rapport.” \n", + "Example Sentence 3: “The two speakers connected with back-and-forth affectionate teasing. Throughout the conversation, they kept building on each other's humor with playful remarks, creating a lighthearted and comfortable discussion.” Overall, the trajectory summary should capture the key moments where the emotional connection of the conversation notably changes. Here is an example of a complete trajectory summary: The conversation begins with two speakers exchanging neutral, surface-level comments. Speaker1 then shifts the tone by sharing a personal anecdote, prompting Speaker2 to respond with warmth and empathy. Speaker1 elaborates on their story and their need, but Speaker2 does not extend their support but retracts it. \n", + "Now, provide the trajectory summary for the following conversation. \n", + "Conversation Transcript: {formatted_object}. \n", + "Now, summarize this conversation. Remember, do not include specific topics, claims, or arguments from the conversation. Instead, try to capture the speakers' sentiments, intentions, and conversational/persuasive strategies. Limit the trajectory summary to 80 words. \n", + "Trajectory Summary:\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "friends_sop_prompt = \"\"\"\n", + "Here is a trajectory summary of a conversation that lays out how the dynamics of the conversation developed. You need to parse the summary into events in order. \n", + "Follow the following guidelines:\n", + "1. Try to maintain the original language of the summary as much as you can. \n", + "2. Provide your output as a Python dictionary with the following structure:\n", + "_(Note: Do NOT use markdown, JSON formatting, or code block delimiters.)_ \n", + "{{\n", + " '0': \"\" // description of the event\n", + " '1': ...\n", + " ...\n", + "}}\n", + "Here is the summary:\n", + "{formatted_object}\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initialize your SCD transformer" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "scd_transformer = SCD(\n", + " model_provider=MODEL_PROVIDER,\n", + " config=config,\n", + " model=MODEL,\n", + " custom_scd_prompt=friends_summary_prompt,\n", + " custom_sop_prompt=friends_sop_prompt,\n", + " custom_prompt_dir=\"friends_prompts\",\n", + " generate_scd=True,\n", + " generate_sop=True,\n", + " scd_metadata_name=\"machine_scd\",\n", + " sop_metadata_name=\"machine_sop\",\n", + " conversation_formatter=format_friends_transcript_from_convokit\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "conversation_ids = list(corpus.get_conversation_ids())[:2]\n", + "selector = lambda conv: conv.id in conversation_ids" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "corpus = scd_transformer.transform(corpus, selector=selector)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SCD: The conversation begins with playful teasing and lighthearted banter, creating a warm atmosphere. As one speaker expresses vulnerability, others respond with empathy and support, fostering a sense of camaraderie. 
The tone shifts to deeper emotional revelations, with moments of humor interspersed, allowing for self-disclosure and connection. Despite some tension, the group maintains a supportive dynamic, ultimately reinforcing their bonds through shared experiences and gentle encouragement, culminating in a mix of nostalgia and understanding.\n", + "SoP: {\n", + " '0': \"The conversation begins with playful teasing and lighthearted banter, creating a warm atmosphere.\",\n", + " '1': \"As one speaker expresses vulnerability, others respond with empathy and support, fostering a sense of camaraderie.\",\n", + " '2': \"The tone shifts to deeper emotional revelations, with moments of humor interspersed, allowing for self-disclosure and connection.\",\n", + " '3': \"Despite some tension, the group maintains a supportive dynamic.\",\n", + " '4': \"Ultimately reinforcing their bonds through shared experiences and gentle encouragement, culminating in a mix of nostalgia and understanding.\"\n", + "}\n" + ] + } + ], + "source": [ + "convo = corpus.get_conversation(conversation_ids[0])\n", + "print(\"SCD: \", convo.meta[\"machine_scd\"])\n", + "print(\"SoP: \", convo.meta[\"machine_sop\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute ConDynS Score\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "condyns = ConDynS(model_provider=MODEL_PROVIDER, \n", + " model=MODEL, \n", + " config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ConDynS Score between conversations s01_e01_c01_u001 and s01_e01_c02_u001: 0.6499999999999999\n", + "Score stored in conversation s01_e01_c01_u001 metadata: 0.6499999999999999\n", + "Score stored in conversation s01_e01_c02_u001 metadata: 0.6499999999999999\n", + "Score reasoning stored in conversation s01_e01_c01_u001 metadata: [{'0': {'analysis': 'Transcript starts with playful banter but lacks warmth.', 'score': 0.3}, '1': {'analysis': 'Some expressions of vulnerability are present, but empathy is minimal.', 'score': 0.4}, '2': {'analysis': 'Emotional revelations occur, but humor is not well interspersed.', 'score': 0.5}, '3': {'analysis': 'Tension is present, but support is inconsistent.', 'score': 0.4}, '4': {'analysis': 'Shared experiences are mentioned, but bonds are not strongly reinforced.', 'score': 0.3}}, {'0': {'analysis': 'Transcript starts with playful banter about dating, matching the first event.', 'score': 1}, '1': {'analysis': 'Speaker6 expresses vulnerability about a breakup, aligning with the second event.', 'score': 1}, '2': {'analysis': 'Speaker2 offers support to Speaker6, resembling reassurance and validation.', 'score': 0.6}, '3': {'analysis': 'Humor returns as the group shares personal stories, matching the third event.', 'score': 1}, '4': {'analysis': 'Camaraderie is evident throughout, balancing support and humor, fitting the last event.', 'score': 1}}]\n", + "Score reasoning stored in conversation s01_e01_c02_u001 metadata: [{'0': {'analysis': 'Transcript starts with playful banter but lacks warmth.', 'score': 0.3}, '1': {'analysis': 'Some expressions of vulnerability are present, but empathy is minimal.', 'score': 0.4}, '2': {'analysis': 'Emotional revelations occur, but humor is not well interspersed.', 'score': 0.5}, '3': {'analysis': 'Tension is present, but support is inconsistent.', 'score': 0.4}, '4': {'analysis': 'Shared 
experiences are mentioned, but bonds are not strongly reinforced.', 'score': 0.3}}, {'0': {'analysis': 'Transcript starts with playful banter about dating, matching the first event.', 'score': 1}, '1': {'analysis': 'Speaker6 expresses vulnerability about a breakup, aligning with the second event.', 'score': 1}, '2': {'analysis': 'Speaker2 offers support to Speaker6, resembling reassurance and validation.', 'score': 0.6}, '3': {'analysis': 'Humor returns as the group shares personal stories, matching the third event.', 'score': 1}, '4': {'analysis': 'Camaraderie is evident throughout, balancing support and humor, fitting the last event.', 'score': 1}}]\n" + ] + } + ], + "source": [ + "convo_id1 = conversation_ids[0]\n", + "convo_id2 = conversation_ids[1]\n", + "\n", + "# Compare conversations\n", + "result, condyns_score = condyns.compare_conversations(\n", + " corpus=corpus,\n", + " convo_id1=convo_id1, \n", + " convo_id2=convo_id2,\n", + " sop_meta_name=\"machine_sop\",\n", + " formatter=format_friends_transcript_from_convokit # Use our custom formatter\n", + ")\n", + "\n", + "print(f\"ConDynS Score between conversations {convo_id1} and {convo_id2}: {condyns_score}\")\n", + "\n", + "convo1 = corpus.get_conversation(convo_id1)\n", + "convo2 = corpus.get_conversation(convo_id2)\n", + "\n", + "score_key1 = f\"condyns_{convo_id1}_{convo_id2}\"\n", + "result_key1 = f\"condyns_result_{convo_id1}_{convo_id2}\"\n", + "score_key2 = f\"condyns_{convo_id2}_{convo_id1}\"\n", + "result_key2 = f\"condyns_result_{convo_id2}_{convo_id1}\"\n", + "print(f\"Score stored in conversation {convo_id1} metadata: {convo1.meta.get(score_key1)}\")\n", + "print(f\"Score stored in conversation {convo_id2} metadata: {convo2.meta.get(score_key2)}\")\n", + "\n", + "print(f\"Score reasoning stored in conversation {convo_id1} metadata: {convo1.meta.get(result_key1)}\")\n", + "print(f\"Score reasoning stored in conversation {convo_id2} metadata: {convo2.meta.get(result_key2)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/convokit/convo_similarity/examples/validation/baselines.ipynb b/convokit/convo_similarity/examples/validation/baselines.ipynb new file mode 100644 index 00000000..61fa3681 --- /dev/null +++ b/convokit/convo_similarity/examples/validation/baselines.ipynb @@ -0,0 +1,244 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6248e97c", + "metadata": {}, + "source": [ + "# Validation Baselines\n", + "\n", + "This notebook implements and evaluates baseline similarity measures used for comparison with our ConDynS measure, where the results are demonstrated in the other notebook. 
It computes metrics such as SBERT cosine similarity, BERTScore, and naive LLM-prompted similarity on both transcript and SCD representations of conversations. These baselines serve as reference points in the validation experiment, allowing us to assess the unique contribution of ConDynS in capturing conversational dynamics beyond topic or surface-level features. Detailed discussion can be found in Section 5 of our [paper: A Similarity Measure for Comparing Conversational Dynamics](https://arxiv.org/abs/2507.18956)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d76cd50f", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from convokit import Corpus, download\n", + "import numpy as np\n", + "import scipy.stats as stats\n", + "import matplotlib.pyplot as plt\n", + "from tqdm import tqdm\n", + "\n", + "from convokit.convo_similarity.utils import get_human_summary_pair_lst" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99f0ac11", + "metadata": {}, + "outputs": [], + "source": [ + "corpus = Corpus(filename=download(\"conversations-gone-awry-cmv-corpus\"))\n", + "corpus.print_summary_stats()\n", + "\n", + "human_pair_lst = get_human_summary_pair_lst(corpus)\n", + "convo_pairs = human_pair_lst + [(j, i) for i, j in human_pair_lst]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d684dcda", + "metadata": {}, + "outputs": [], + "source": [ + "ARTEFACTS_DIR = \"./artefacts/\"" + ] + }, + { + "cell_type": "markdown", + "id": "36be2341", + "metadata": {}, + "source": [ + "# Running All Baseline Methods\n", + "\n", + "The following script runs all baseline similarity metrics (e.g., SBERT cosine similarity, BERTScore, naive prompting) on the provided conversation pairs, with different input types (raw transcript of the conversation, or its SCD)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5df112e6", + "metadata": {}, + "outputs": [], + "source": [ + "with open(ARTEFACTS_DIR + \"validation_gpt/transcript_simulations.json\", \"r\") as f:\n", + " transcript_simulations = json.load(f)\n", + "with open(ARTEFACTS_DIR + \"validation_gpt/transcript_simulations_topic_shuffled.json\", \"r\") as f:\n", + " transcript_simulations_topic_shuffled = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91d4c88d", + "metadata": {}, + "outputs": [], + "source": [ + "### Calling GPT to make naive comparisons, run with caution.\n", + "from convokit.convo_similarity.utils import format_transcript_from_convokit, get_human_summary\n", + "from convokit.convo_similarity.baseline import ConDynSBaselines\n", + "from convokit.genai.genai_config import GenAIConfigManager\n", + "\n", + "config = GenAIConfigManager() ## make sure to set your own config if this is never set before\n", + "MODEL_PROVIDER = \"gpt\"\n", + "MODEL = \"gpt-4o-mini\"\n", + "config.set_api_key(\"gpt\", \"YOUR API KEY\")\n", + "baselines = ConDynSBaselines(model_provider=\"gpt\", config=config)\n", + "\n", + "self_results = {}\n", + "self_scores = []\n", + "for convo_id1, convo_id2 in tqdm(convo_pairs, desc=\"Calculating self sim similarity\"):\n", + " transcript1 = \"\\n\\n\".join(format_transcript_from_convokit(corpus, convo_id1))\n", + " transcript2 = transcript_simulations[convo_id1]['generated_transcript']\n", + "\n", + " scd1 = get_human_summary(corpus, convo_id1)['summary_text']\n", + " scd2 = transcript_simulations[convo_id1]['summary']['summary_text']\n", + "\n", + " transcript_bertscore = baselines.get_bertscore(transcript1, transcript2)['f1'][0]\n", + " transcript_cos_sim = baselines.get_cosine_similarity(transcript1, transcript2)\n", + " transcript_naive_gpt, _ = baselines.get_naive_gpt_compare_score_Transcripts(transcript1, transcript2)\n", + "\n", + " scd_bertscore = baselines.get_bertscore(scd1, scd2)['f1'][0]\n", + " scd_cos_sim = baselines.get_cosine_similarity(scd1, scd2)\n", + " scd_naive_gpt, _ = baselines.get_naive_gpt_compare_score_SCDs(scd1, scd2)\n", + "\n", + " results = {\"transcript_bertscore\" : transcript_bertscore,\n", + " \"transcript_cos_sim\" : transcript_cos_sim,\n", + " \"transcript_naive_gpt\" : transcript_naive_gpt,\n", + " \"scd_bertscore\" : scd_bertscore,\n", + " \"scd_cos_sim\" : scd_cos_sim,\n", + " \"scd_naive_gpt\" : scd_naive_gpt}\n", + " self_results[str(convo_id1)] = results\n", + " self_scores.append(results)\n", + "\n", + "pair_results = {}\n", + "pair_scores = []\n", + "for convo_id1, convo_id2 in tqdm(convo_pairs, desc=\"Calculating pair sim similarity\"):\n", + " transcript1 = \"\\n\\n\".join(format_transcript_from_convokit(corpus, convo_id1))\n", + " transcript2 = transcript_simulations[convo_id2]['generated_transcript']\n", + " \n", + " scd1 = get_human_summary(corpus, convo_id1)['summary_text']\n", + " scd2 = transcript_simulations[convo_id2]['summary']['summary_text']\n", + "\n", + " transcript_bertscore = baselines.get_bertscore(transcript1, transcript2)['f1'][0]\n", + " transcript_cos_sim = baselines.get_cosine_similarity(transcript1, transcript2)\n", + " transcript_naive_gpt, _ = baselines.get_naive_gpt_compare_score_Transcripts(transcript1, transcript2)\n", + "\n", + " scd_bertscore = baselines.get_bertscore(scd1, scd2)['f1'][0]\n", + " scd_cos_sim = baselines.get_cosine_similarity(scd1, scd2)\n", + " scd_naive_gpt, _ = baselines.get_naive_gpt_compare_score_SCDs(scd1, 
scd2)\n", + "\n", + " results = {\"transcript_bertscore\" : transcript_bertscore,\n", + " \"transcript_cos_sim\" : transcript_cos_sim,\n", + " \"transcript_naive_gpt\" : transcript_naive_gpt,\n", + " \"scd_bertscore\" : scd_bertscore,\n", + " \"scd_cos_sim\" : scd_cos_sim,\n", + " \"scd_naive_gpt\" : scd_naive_gpt}\n", + " \n", + " pair_results[str(convo_id2)] = results\n", + " pair_scores.append(results)\n", + "\n", + "\n", + "all_baseline_scores = {\"self_results\" : self_results,\n", + " \"self_scores\" : self_scores,\n", + " \"pair_results\" : pair_results,\n", + " \"pair_scores\" : pair_scores}\n", + "\n", + "with open(ARTEFACTS_DIR + \"validation_gpt/baseline/baseline_results.json\", \"w\") as f:\n", + " json.dump(all_baseline_scores, f, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "476e867b", + "metadata": {}, + "outputs": [], + "source": [ + "with open(ARTEFACTS_DIR + \"validation_gpt/baseline/baseline_results.json\", \"r\") as f:\n", + " all_baseline_scores = json.load(f)\n", + "\n", + "self_scores = all_baseline_scores[\"self_scores\"]\n", + "pair_scores = all_baseline_scores[\"pair_scores\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6627a9f", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_baseline_scores(self_scores, pair_scores, input_type, score_method):\n", + " \"\"\"\n", + " Input:\n", + " input_type: transcript, scd\n", + " score_method: cos_sim, bertscore, naive_gpt\n", + " \"\"\"\n", + " accuracy = []\n", + " self_raw_scores = [x[f\"{input_type}_{score_method}\"] for x in self_scores]\n", + " pair_raw_scores = [x[f\"{input_type}_{score_method}\"] for x in pair_scores]\n", + " accuracy = [x > y for x, y in zip(self_raw_scores, pair_raw_scores)]\n", + "\n", + " print(sum(accuracy) / len(accuracy))\n", + " print(np.mean(self_raw_scores))\n", + " print(np.mean(pair_raw_scores))\n", + " print(stats.wilcoxon(self_raw_scores, pair_raw_scores))\n", + "\n", + " plt.hist(self_raw_scores, alpha = 0.6, label = \"self simulation\")\n", + " plt.hist(pair_raw_scores, alpha = 0.6, label = \"pair simulation\")\n", + " plt.xlabel(\"number of conversations\")\n", + " plt.ylabel(\"similarity scores\")\n", + " plt.title(f\"Baseline ({score_method}) Score distribution of sim vs sim ({input_type})\")\n", + " plt.legend()\n", + " plt.show()\n", + "\n", + "def get_baseline_acc(self_scores, pair_scores, input_type, score_method):\n", + " \"\"\"\n", + " Input:\n", + " input_type: transcript, scd\n", + " score_method: cos_sim, bertscore, naive_gpt\n", + " \"\"\"\n", + " accuracy = []\n", + " self_raw_scores = [x[f\"{input_type}_{score_method}\"] for x in self_scores]\n", + " pair_raw_scores = [x[f\"{input_type}_{score_method}\"] for x in pair_scores]\n", + " accuracy = [x > y for x, y in zip(self_raw_scores, pair_raw_scores)]\n", + "\n", + " return sum(accuracy) / len(accuracy)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75bea43e", + "metadata": {}, + "outputs": [], + "source": [ + "### Outputting the results from all baseline methods ###\n", + "input_types = [\"transcript\", \"scd\"]\n", + "score_methods = [\"cos_sim\", \"bertscore\", \"naive_gpt\"]\n", + "\n", + "for score_method in score_methods:\n", + " for input_type in input_types:\n", + " acc = get_baseline_acc(self_scores, pair_scores, input_type, score_method)\n", + " print(f\"###### {score_method} + {input_type} ######\")\n", + " print(f\"Acc: {acc:.4f}\\n\")\n", + " print()" + ] + } + ], + "metadata": { + "language_info": { + 
"name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/convokit/convo_similarity/examples/validation/validation.ipynb b/convokit/convo_similarity/examples/validation/validation.ipynb new file mode 100644 index 00000000..f26c9488 --- /dev/null +++ b/convokit/convo_similarity/examples/validation/validation.ipynb @@ -0,0 +1,714 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "66f8cc3c", + "metadata": {}, + "source": [ + "# Validation Setup for ConDynS\n", + "\n", + "This notebook demonstrates the validation procedure for ConDynS, our similarity measure for comparing conversational dynamics, introduced in the [paper: A Similarity Measure for Comparing Conversational Dynamics](https://arxiv.org/abs/2507.18956). It constructs anchor–positive–negative triplets of conversations from Reddit, where positives share similar dynamics with the anchor and negatives differ, and evaluates how well ConDynS distinguishes them relative to baseline similarity measures (e.g., SBERT cosine similarity, BERTScore), as demonstrate in the other demo notebook. This notebook follows the methodology described in the paper." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddb45738", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from convokit import Corpus, download\n", + "from tqdm import tqdm\n", + "import numpy as np\n", + "import scipy.stats as stats\n", + "import random\n", + "random.seed(4300)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b63ee87a", + "metadata": {}, + "outputs": [], + "source": [ + "corpus = Corpus(filename=download(\"conversations-gone-awry-cmv-corpus\"))\n", + "corpus.print_summary_stats()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c8f4f99", + "metadata": {}, + "outputs": [], + "source": [ + "### Get the human and machine summary ids ###\n", + "human_summary_ids = corpus.get_conversation_ids(selector=lambda conversation: conversation.meta[\"summary_meta\"] != []\n", + "and any(summary_meta[\"summary_type\"] == \"human_written_SCD\" for summary_meta in conversation.meta[\"summary_meta\"]))\n", + "machine_summary_ids = corpus.get_conversation_ids(selector=lambda conversation: conversation.meta[\"summary_meta\"] != []\n", + " and any(summary_meta[\"summary_type\"] == \"machine_generated_SCD\" for summary_meta in conversation.meta[\"summary_meta\"]))\n", + "pair_of = {}\n", + "for convo_id in human_summary_ids:\n", + " convo = corpus.get_conversation(convo_id)\n", + " pair_of[convo.id] = convo.meta['pair_id']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfcd36d7", + "metadata": {}, + "outputs": [], + "source": [ + "### Get pair info ###\n", + "human_summary_pair = [] # (calm, awry) \n", + "for convo_id in human_summary_ids:\n", + " convo = corpus.get_conversation(convo_id)\n", + " if convo.meta['has_removed_comment']:\n", + " if (convo.meta['pair_id'],convo.id) not in human_summary_pair:\n", + " human_summary_pair.append((convo.meta['pair_id'],convo.id))\n", + " else:\n", + " if (convo.id, convo.meta['pair_id']) not in human_summary_pair:\n", + " human_summary_pair.append((convo.meta['pair_id'],convo.id))\n", + "print(\"Number of conversation pair: \", len(human_summary_pair))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa7be53b", + "metadata": {}, + "outputs": [], + "source": [ + "ARTEFACTS_DIR = \"./artefacts/\"" + ] + }, + { + "cell_type": "markdown", + "id": "d5680368", + "metadata": {}, + "source": 
[ + "# ConDynS Validation\n", + "\n", + "Here we compute ConDynS on a subset of Reddit conversations with constructed triplets to validate the measure's usefulness in capturing and comparing conversational dynamics (discussed in detail in paper Section 5). The followings are steps to conduct the validation setup." + ] + }, + { + "cell_type": "markdown", + "id": "0bd4130a", + "metadata": {}, + "source": [ + "## Simulating Conversations\n", + "\n", + "To construct the triplets used for validating ConDynS (see Section 5 of the paper), we simulate synthetic conversations from human-written SCDs provided in the ConvoKit corpus. These SCDs abstract away surface content while preserving conversational dynamics. By generating conversations from these summaries, we can also assign new topics—allowing us to test whether ConDynS remains sensitive to dynamics while being invariant to topical changes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc6dd235", + "metadata": {}, + "outputs": [], + "source": [ + "from convokit.convo_similarity.utils import format_transcript_from_convokit, get_human_summary\n", + "from convokit.genai import get_llm_client\n", + "from convokit.genai.genai_config import GenAIConfigManager\n", + "\n", + "config = GenAIConfigManager() ### make sure to set your own config if this is never set before\n", + "MODEL_PROVIDER = \"gpt\"\n", + "MODEL = \"gpt-4o-mini\"\n", + "config.set_api_key(\"gpt\", \"YOUR API KEY\")\n", + "client = get_llm_client(\"gpt\", config)\n", + "\n", + "def gpt_query(prompt, **kwargs):\n", + " response = client.generate(prompt, **kwargs)\n", + " return response.text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24e31604", + "metadata": {}, + "outputs": [], + "source": [ + "### Extract topic of the conversations ###\n", + "topic_msg = \"\"\"Here are two conversations of the same topic. Summarize the topic of the conversations in a concise phrase that accurately captures the main subject being discussed.\n", + "Here is the transcript of the first conversation:\n", + "{transcript1}\n", + "\n", + "Here is the transcript of the second conversation:\n", + "{transcript2}\n", + "\n", + "Now, write the topic of the conversation in a concise phrase:\n", + "\"\"\"\n", + "topic = {}\n", + "for calm_convo_id, awry_convo_id in tqdm(human_summary_pair):\n", + " calm_transcript = format_transcript_from_convokit(corpus, calm_convo_id)\n", + " awry_transcript = format_transcript_from_convokit(corpus, awry_convo_id)\n", + " query = topic_msg.format(transcript1 = '\\n'.join(calm_transcript), transcript2 = '\\n'.join(awry_transcript))\n", + " response = gpt_query(query)\n", + " topic[calm_convo_id] = response\n", + " topic[awry_convo_id] = response\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9b1ffef", + "metadata": {}, + "outputs": [], + "source": [ + "### Simulate transcript ###\n", + "simulation_msg = \"\"\"You are given a task to recreate an online conversation that occured on reddit. Here is a list of information you are given.\n", + "1. Topic of the conversation: {topic}\n", + "2. Trajectory summary that summarizes the conversational and speakers' dynamics: {trajectory_summary}\n", + "\n", + "Each utterance of the transcript should be formatted as the following:\n", + "Speaker_ID (e.g. \"SPEAKER2\") : [Added text of the utterance]\n", + "\n", + "\n", + "#Output\n", + "Add your recreated conversation. Only generate the transcript of the conversation. 
\n", + "\"\"\"\n", + "generated_transcripts = {}\n", + "for calm_convo_id, awry_convo_id in tqdm(human_summary_pair):\n", + " calm_human_summary = get_human_summary(corpus, calm_convo_id)\n", + " awry_human_summary = get_human_summary(corpus, awry_convo_id)\n", + " calm_query = simulation_msg.format(topic=topic[calm_convo_id],trajectory_summary=calm_human_summary['summary_text'])\n", + " calm_response = gpt_query(calm_query)\n", + " generated_transcripts[calm_convo_id] = calm_response\n", + " awry_query = simulation_msg.format(topic=topic[awry_convo_id],trajectory_summary=awry_human_summary['summary_text'])\n", + " awry_response = gpt_query(awry_query)\n", + " generated_transcripts[awry_convo_id] = awry_response\n", + "\n", + "output = {}\n", + "for convo_id in generated_transcripts:\n", + " output[convo_id] = {\n", + " 'transcript': generated_transcripts[convo_id],\n", + " 'topic': topic[convo_id]\n", + " }\n", + "with open(ARTEFACTS_DIR + \"validation_gpt/transcript_simulations.json\", \"w\") as f:\n", + " json.dump(output, f, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07e01407", + "metadata": {}, + "outputs": [], + "source": [ + "### Topic shuffle transcript simulation ###\n", + "topic_set = []\n", + "for i, (calm_convo_id, awry_convo_id) in enumerate(human_summary_pair):\n", + " topic_set.append(generated_transcripts[human_summary_pair[(i) % len(human_summary_pair)][0]]['topic'])\n", + "\n", + "new_topic = {}\n", + "for i, (calm_convo_id, awry_convo_id) in enumerate(human_summary_pair):\n", + " new_topic[calm_convo_id] = random.choice(topic_set)\n", + " new_topic[awry_convo_id] = random.choice(topic_set)\n", + "for convo_id in new_topic:\n", + " assert new_topic[convo_id] != generated_transcripts[convo_id]['topic']\n", + "assert len(new_topic) == len(generated_transcripts)\n", + "\n", + "generated_transcripts_topic_shuffled = {}\n", + "for calm_convo_id, awry_convo_id in tqdm(human_summary_pair):\n", + " calm_human_summary = get_human_summary(corpus, calm_convo_id)\n", + " awry_human_summary = get_human_summary(corpus, awry_convo_id)\n", + " calm_query = simulation_msg.format(topic=new_topic[calm_convo_id],trajectory_summary=calm_human_summary['summary_text']) #Adding new topic \n", + " calm_response = gpt_query(calm_query)\n", + " generated_transcripts_topic_shuffled[calm_convo_id] = calm_response\n", + " awry_query = simulation_msg.format(topic=new_topic[awry_convo_id],trajectory_summary=awry_human_summary['summary_text']) #Adding new topic\n", + " awry_response = gpt_query(awry_query)\n", + " generated_transcripts_topic_shuffled[awry_convo_id] = awry_response\n", + "\n", + "output = {}\n", + "for convo_id in generated_transcripts_topic_shuffled:\n", + " output[convo_id] = {\n", + " 'generated_transcript': generated_transcripts_topic_shuffled[convo_id],\n", + " 'topic': new_topic[convo_id]\n", + " }\n", + "with open(ARTEFACTS_DIR + \"validation_gpt/transcript_simulations_topic_shuffled.json\", \"w\") as f:\n", + " json.dump(output, f, indent=4)" + ] + }, + { + "cell_type": "markdown", + "id": "e95aecba", + "metadata": {}, + "source": [ + "## Writing SCDs and SoPs\n", + "\n", + "Now we generate the Summaries of Conversational Dynamics (SCDs) and extracts their corresponding Sequences of Patterns (SoPs), which are required inputs for computing the ConDynS score. The SCDs provide high-level abstractions of conversational flow, while the SoPs capture the ordered interaction patterns needed for alignment. 
These representations are prepared for both real and simulated conversations to ensure consistency during the validation procedure." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fb20680", + "metadata": {}, + "outputs": [], + "source": [ + "from convokit.convo_similarity.scd import SCD\n", + "scd_transformer_gpt = SCD(model_provider=MODEL_PROVIDER, model=MODEL, config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a161155", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a selector for the conversations we want to process\n", + "def validation_selector(conversation):\n", + " return conversation.id in pair_of\n", + "\n", + "# Transform the corpus to generate SCDs and SoPs\n", + "scd_transformer_gpt.transform(corpus, selector=validation_selector)\n", + "\n", + "# Extract results\n", + "scd = {}\n", + "bulletpoints = {}\n", + "for convo_id in pair_of:\n", + " convo = corpus.get_conversation(convo_id)\n", + " scd[convo_id] = convo.meta.get(\"machine_scd\", \"\")\n", + " bulletpoints[convo_id] = convo.meta.get(\"machine_sop\", \"\")\n", + "\n", + "with open(ARTEFACTS_DIR + f\"validation_gpt/scd_og.json\", 'w') as f:\n", + " json.dump(scd, f, indent=4)\n", + "\n", + "with open(ARTEFACTS_DIR + f\"validation_gpt/sop_og.json\", 'w') as f:\n", + " json.dump(bulletpoints, f, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ce24e56", + "metadata": {}, + "outputs": [], + "source": [ + "with open(ARTEFACTS_DIR + \"validation_gpt/transcript_simulations.json\", \"r\") as f:\n", + " simulated_transcripts = json.load(f)\n", + "\n", + "# For simulated transcripts, we need to use the transformer differently\n", + "# since they're not in the corpus. We'll create temporary conversations.\n", + "\n", + "from convokit import Conversation, Utterance, Speaker\n", + "import re\n", + "\n", + "def create_temp_conversation_from_transcript(transcript, convo_id):\n", + " \"\"\"Create a temporary ConvoKit conversation from a transcript string.\"\"\"\n", + " lines = transcript.strip().split('\\n')\n", + " utterances = []\n", + " speakers = {}\n", + " \n", + " for i, line in enumerate(lines):\n", + " if ':' in line:\n", + " speaker_part, text = line.split(':', 1)\n", + " speaker_id = speaker_part.strip()\n", + " text = text.strip()\n", + " \n", + " if speaker_id not in speakers:\n", + " speakers[speaker_id] = Speaker(id=speaker_id)\n", + " \n", + " utt = Utterance(\n", + " id=f\"{convo_id}_sim_{i}\",\n", + " speaker=speakers[speaker_id],\n", + " conversation_id=f\"{convo_id}_sim\",\n", + " text=text\n", + " )\n", + " utterances.append(utt)\n", + " \n", + " return Conversation(id=f\"{convo_id}_sim\", utterances=utterances)\n", + "\n", + "sim_utterances = []\n", + "for convo_id in pair_of:\n", + " transcript = simulated_transcripts[convo_id]['transcript']\n", + " temp_convo = create_temp_conversation_from_transcript(transcript, convo_id)\n", + " sim_utterances.extend(temp_convo.iter_utterances())\n", + "\n", + "sim_corpus = Corpus(utterances=sim_utterances)\n", + "\n", + "scd_transformer_gpt.transform(sim_corpus)\n", + "\n", + "scd = {}\n", + "bulletpoints = {}\n", + "for convo_id in pair_of:\n", + " sim_convo = sim_corpus.get_conversation(f\"{convo_id}_sim\")\n", + " scd[convo_id] = sim_convo.meta.get(\"machine_scd\", \"\")\n", + " bulletpoints[convo_id] = sim_convo.meta.get(\"machine_sop\", \"\")\n", + "\n", + "with open(ARTEFACTS_DIR + f\"validation_gpt/scd_sim.json\", 'w') as f:\n", + " json.dump(scd, f, 
indent=4)\n", + "\n", + "with open(ARTEFACTS_DIR + f\"validation_gpt/sop_sim.json\", 'w') as f:\n", + " json.dump(bulletpoints, f, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e678550", + "metadata": {}, + "outputs": [], + "source": [ + "with open(ARTEFACTS_DIR + \"validation_gpt/transcript_simulations_topic_shuffled.json\", \"r\") as f:\n", + " simulated_transcripts_topic_shuffled = json.load(f)\n", + "\n", + "# Create temporary corpus with topic-shuffled simulated conversations\n", + "sim_shuffled_utterances = []\n", + "for convo_id in pair_of:\n", + " transcript = simulated_transcripts_topic_shuffled[convo_id]['generated_transcript']\n", + " temp_convo = create_temp_conversation_from_transcript(transcript, convo_id)\n", + " # Use different conversation ID to avoid conflicts\n", + " for utt in temp_convo.iter_utterances():\n", + " utt.conversation_id = f\"{convo_id}_sim_shuffled\"\n", + " utt.id = utt.id.replace(\"_sim_\", \"_sim_shuffled_\")\n", + " temp_convo.id = f\"{convo_id}_sim_shuffled\"\n", + " sim_shuffled_utterances.extend(temp_convo.iter_utterances())\n", + "\n", + "sim_shuffled_corpus = Corpus(utterances=sim_shuffled_utterances)\n", + "\n", + "# Transform the topic-shuffled simulated corpus\n", + "scd_transformer_gpt.transform(sim_shuffled_corpus)\n", + "\n", + "scd = {}\n", + "bulletpoints = {}\n", + "for convo_id in pair_of:\n", + " sim_convo = sim_shuffled_corpus.get_conversation(f\"{convo_id}_sim_shuffled\")\n", + " scd[convo_id] = sim_convo.meta.get(\"machine_scd\", \"\")\n", + " bulletpoints[convo_id] = sim_convo.meta.get(\"machine_sop\", \"\")\n", + "\n", + "with open(ARTEFACTS_DIR + f\"validation_gpt/scd_sim_topic_shuffled.json\", 'w') as f:\n", + " json.dump(scd, f, indent=4)\n", + "\n", + "with open(ARTEFACTS_DIR + f\"validation_gpt/sop_sim_topic_shuffled.json\", 'w') as f:\n", + " json.dump(bulletpoints, f, indent=4)" + ] + }, + { + "cell_type": "markdown", + "id": "e029b312", + "metadata": {}, + "source": [ + "## Compute ConDynS Score\n", + "\n", + "Finally, we are now ready to compute the ConDynS scores between conversation pairs. Using the SoP from one conversation and the transcript of the other, we apply the alignment procedure described in the paper to quantify how similar their dynamics are.\n", + "\n", + "In this validation of our ConDynS measure, we compare ConDynS scores for each triplet (anchor, positive, negative, introduced in Section 5). The metric is expected to assign a higher similarity score to the anchor–positive pair (which shares dynamics) than to the anchor–negative pair (which differs in dynamics). Accuracy is computed as the proportion of triplets where this condition holds. As reported in Table 1 of the paper, ConDynS achieves substantially higher accuracy than baseline methods across same-topic, different-topic, and adversarial-topic conditions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b493eae", + "metadata": {}, + "outputs": [], + "source": [ + "from convokit.convo_similarity.condyns import ConDynS\n", + "condyns_gpt = ConDynS(model_provider=MODEL_PROVIDER, model=MODEL, config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0b032bf", + "metadata": {}, + "outputs": [], + "source": [ + "def evaluate_condyns_results(self_scores, pair_scores):\n", + " performance = []\n", + " for score1, score2 in zip(self_scores, pair_scores):\n", + " performance.append(score1 > score2)\n", + " print(\"Accuracy:\",sum(performance) / len(performance), f\"for {len(performance)} pairs\")\n", + " print(stats.wilcoxon(self_scores, pair_scores))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53e8cdb3", + "metadata": {}, + "outputs": [], + "source": [ + "### Load SCDs and SoPs ###\n", + "with open(ARTEFACTS_DIR + \"validation_gpt/scd_og.json\", \"r\") as f:\n", + " scd_og = json.load(f)\n", + "with open(ARTEFACTS_DIR + \"validation_gpt/sop_og.json\", \"r\") as f:\n", + " sop_og = json.load(f)\n", + "with open(ARTEFACTS_DIR + \"validation_gpt/scd_sim.json\", \"r\") as f:\n", + " scd_sim = json.load(f)\n", + "with open(ARTEFACTS_DIR + \"validation_gpt/sop_sim.json\", \"r\") as f:\n", + " sop_sim = json.load(f)\n", + "with open(ARTEFACTS_DIR + \"validation_gpt/scd_sim_topic_shuffled.json\", \"r\") as f:\n", + " scd_sim_topic_shuffled = json.load(f)\n", + "with open(ARTEFACTS_DIR + \"validation_gpt/sop_sim_topic_shuffled.json\", \"r\") as f:\n", + " sop_sim_topic_shuffled = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71a9fccf", + "metadata": {}, + "outputs": [], + "source": [ + "### Compute ConDynS with simulated transcripts ###\n", + "with open(ARTEFACTS_DIR + \"validation_gpt/transcript_simulations.json\", \"r\") as f:\n", + " simulated_transcripts = json.load(f)\n", + "\n", + "# For validation, we need to compare original conversations with simulated ones\n", + "# Since simulated conversations are in a different corpus, we need to handle this differently\n", + "\n", + "self_scores = []\n", + "self_results = {}\n", + "\n", + "for convo_id in tqdm(pair_of):\n", + " # Get original conversation transcript and SoP\n", + " transcript1 = \"\\n\\n\".join(format_transcript_from_convokit(corpus, convo_id))\n", + " transcript2 = simulated_transcripts[convo_id]['transcript']\n", + " sop1 = sop_og[convo_id]\n", + " sop2 = sop_sim[convo_id]\n", + " \n", + " # Use the lower-level compute_bidirectional_similarity method\n", + " results = condyns_gpt.compute_bidirectional_similarity(transcript1, transcript2, sop1, sop2)\n", + " self_results[convo_id] = results\n", + " self_scores.append(np.mean(condyns_gpt.compute_score_from_results(results)))\n", + "\n", + "pair_scores = []\n", + "pair_results = {}\n", + "for convo_id in tqdm(pair_of):\n", + " transcript1 = \"\\n\\n\".join(format_transcript_from_convokit(corpus, convo_id))\n", + " transcript2 = simulated_transcripts[pair_of[convo_id]]['transcript']\n", + " sop1 = sop_og[convo_id]\n", + " sop2 = sop_sim[pair_of[convo_id]]\n", + " \n", + " results = condyns_gpt.compute_bidirectional_similarity(transcript1, transcript2, sop1, sop2)\n", + " pair_results[convo_id] = results\n", + " pair_scores.append(np.mean(condyns_gpt.compute_score_from_results(results)))\n", + "\n", + "output = {\"self\" : self_results, \"pair\" : pair_results}\n", + "with open(ARTEFACTS_DIR + 
f\"validation_gpt/condyns_og-sim.json\", 'w') as f:\n", + " json.dump(output, f, indent=4)\n", + "\n", + "evaluate_condyns_results(self_scores, pair_scores)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e66e669", + "metadata": {}, + "outputs": [], + "source": [ + "### Compute ConDynS with topic shuffled simulated transcripts ###\n", + "with open(ARTEFACTS_DIR + \"validation_gpt/transcript_simulations_topic_shuffled.json\", \"r\") as f:\n", + " simulated_transcripts_topic_shuffled = json.load(f)\n", + "\n", + "self_scores = []\n", + "self_results = {}\n", + "\n", + "for convo_id in tqdm(pair_of):\n", + " transcript1 = \"\\n\\n\".join(format_transcript_from_convokit(corpus, convo_id))\n", + " transcript2 = simulated_transcripts_topic_shuffled[convo_id]['generated_transcript']\n", + " sop1 = sop_og[convo_id]\n", + " sop2 = sop_sim_topic_shuffled[convo_id]\n", + " results = condyns_gpt.compute_bidirectional_similarity(transcript1, transcript2, sop1, sop2)\n", + " self_results[convo_id] = results\n", + " self_scores.append(np.mean(condyns_gpt.compute_score_from_results(results)))\n", + "\n", + "pair_scores = []\n", + "pair_results = {}\n", + "for convo_id in tqdm(pair_of):\n", + " transcript1 = \"\\n\\n\".join(format_transcript_from_convokit(corpus, convo_id))\n", + " transcript2 = simulated_transcripts_topic_shuffled[pair_of[convo_id]]['generated_transcript']\n", + " sop1 = sop_og[convo_id]\n", + " sop2 = sop_sim_topic_shuffled[pair_of[convo_id]]\n", + " results = condyns_gpt.compute_bidirectional_similarity(transcript1, transcript2, sop1, sop2)\n", + " pair_results[convo_id] = results\n", + " pair_scores.append(np.mean(condyns_gpt.compute_score_from_results(results)))\n", + "\n", + "output = {\"self\" : self_results, \"pair\" : pair_results}\n", + "with open(ARTEFACTS_DIR + f\"validation_gpt/condyns_og-sim_topic_shuffled.json\", 'w') as f:\n", + " json.dump(output, f, indent=4)\n", + "\n", + "evaluate_condyns_results(self_scores, pair_scores)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dd6560d", + "metadata": {}, + "outputs": [], + "source": [ + "### Compute ConDynS with Adversarial simulated transcripts ###\n", + "with open(ARTEFACTS_DIR + f\"validation_gpt/condyns_og-sim.json\", 'r') as f:\n", + " sim_results = json.load(f)\n", + "\n", + "with open(ARTEFACTS_DIR + f\"validation_gpt/condyns_og-sim_topic_shuffled.json\", 'r') as f:\n", + " topic_shuffle_results = json.load(f)\n", + "\n", + "self_results = topic_shuffle_results['self']\n", + "pair_results = sim_results['pair']\n", + "\n", + "self_scores = []\n", + "for convo_id in self_results:\n", + " results = self_results[convo_id]\n", + " self_scores.append(condyns_gpt.compute_score_from_results(results))\n", + "\n", + "pair_scores = []\n", + "for convo_id in pair_results:\n", + " results = pair_results[convo_id]\n", + " pair_scores.append(condyns_gpt.compute_score_from_results(results))\n", + "\n", + "evaluate_condyns_results(self_scores, pair_scores)" + ] + }, + { + "cell_type": "markdown", + "id": "d0332c40", + "metadata": {}, + "source": [ + "## ConDynS SoP to SoP Alignment\n", + "\n", + "Here we also include ConDynS computation with SoP-to-SoP alignment that is presented in the paper, where both conversations use their pattern sequences. This keeps order information but can miss overlapping patterns. 
This can serve as a comparison to highlight ConDynS’s benefit of combining SoP precision with transcript recall.\n", + "\n", + "Notice in the following code, we call it Naive ConDynS, because it is more \"naive\" comparing to our advanced ConDynS above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6606a104", + "metadata": {}, + "outputs": [], + "source": [ + "from convokit.convo_similarity.naive_condyns import NaiveConDynS\n", + "naive_condyns_gpt = NaiveConDynS(model_provider=MODEL_PROVIDER, model=MODEL, config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b921ec3", + "metadata": {}, + "outputs": [], + "source": [ + "### Compute NaiveConDynS with simulated transcripts ###\n", + "with open(ARTEFACTS_DIR + \"validation_gpt/transcript_simulations.json\", \"r\") as f:\n", + " simulated_transcripts = json.load(f)\n", + "\n", + "self_scores = []\n", + "self_results = {}\n", + "\n", + "for convo_id in tqdm(pair_of):\n", + " sop1 = sop_og[convo_id]\n", + " sop2 = sop_sim[convo_id]\n", + " results = naive_condyns_gpt.compute_bidirectional_naive_condyns(sop1, sop2)\n", + " self_results[convo_id] = results\n", + " self_scores.append(np.mean(naive_condyns_gpt.compute_score_from_results(results)))\n", + "\n", + "pair_scores = []\n", + "pair_results = {}\n", + "for convo_id in tqdm(pair_of):\n", + " sop1 = sop_og[convo_id]\n", + " sop2 = sop_sim[pair_of[convo_id]]\n", + " results = naive_condyns_gpt.compute_bidirectional_naive_condyns(sop1, sop2)\n", + " pair_results[convo_id] = results\n", + " pair_scores.append(np.mean(naive_condyns_gpt.compute_score_from_results(results)))\n", + "\n", + "output = {\"self\" : self_results, \"pair\" : pair_results}\n", + "with open(ARTEFACTS_DIR + f\"validation_gpt/naive_condyns_og-sim.json\", 'w') as f:\n", + " json.dump(output, f, indent=4)\n", + "\n", + "evaluate_condyns_results(self_scores, pair_scores)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee6b9638", + "metadata": {}, + "outputs": [], + "source": [ + "### Compute NaiveConDynS with topic shuffled simulated transcripts ###\n", + "with open(ARTEFACTS_DIR + \"validation_gpt/transcript_simulations_topic_shuffled.json\", \"r\") as f:\n", + " simulated_transcripts_topic_shuffled = json.load(f)\n", + "\n", + "self_scores = []\n", + "self_results = {}\n", + "\n", + "for convo_id in tqdm(pair_of):\n", + " sop1 = sop_og[convo_id]\n", + " sop2 = sop_sim_topic_shuffled[convo_id]\n", + " results = naive_condyns_gpt.compute_bidirectional_naive_condyns(sop1, sop2)\n", + " self_results[convo_id] = results\n", + " self_scores.append(np.mean(naive_condyns_gpt.compute_score_from_results(results)))\n", + "\n", + "pair_scores = []\n", + "pair_results = {}\n", + "for convo_id in tqdm(pair_of):\n", + " sop1 = sop_og[convo_id]\n", + " sop2 = sop_sim_topic_shuffled[pair_of[convo_id]]\n", + " results = naive_condyns_gpt.compute_bidirectional_naive_condyns(sop1, sop2)\n", + " pair_results[convo_id] = results\n", + " pair_scores.append(np.mean(naive_condyns_gpt.compute_score_from_results(results)))\n", + "\n", + "output = {\"self\" : self_results, \"pair\" : pair_results}\n", + "with open(ARTEFACTS_DIR + f\"validation_gpt/naive_condyns_og-sim_topic_shuffled.json\", 'w') as f:\n", + " json.dump(output, f, indent=4)\n", + "\n", + "evaluate_condyns_results(self_scores, pair_scores)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc6e7e8c", + "metadata": {}, + "outputs": [], + "source": [ + "### Compute NaiveConDynS with 
Adversarial simulated transcripts ###\n", + "with open(ARTEFACTS_DIR + f\"validation_gpt/naive_condyns_og-sim.json\", 'r') as f:\n", + " sim_results = json.load(f)\n", + "\n", + "with open(ARTEFACTS_DIR + f\"validation_gpt/naive_condyns_og-sim_topic_shuffled.json\", 'r') as f:\n", + " topic_shuffle_results = json.load(f)\n", + "\n", + "self_results = topic_shuffle_results['self']\n", + "pair_results = sim_results['pair']\n", + "\n", + "self_scores = []\n", + "for convo_id in self_results:\n", + " results = self_results[convo_id]\n", + " self_scores.append(naive_condyns_gpt.compute_score_from_results(results))\n", + "\n", + "pair_scores = []\n", + "for convo_id in pair_results:\n", + " results = pair_results[convo_id]\n", + " pair_scores.append(naive_condyns_gpt.compute_score_from_results(results))\n", + "\n", + "evaluate_condyns_results(self_scores, pair_scores)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "convokit", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/convokit/convo_similarity/naive_condyns.py b/convokit/convo_similarity/naive_condyns.py new file mode 100644 index 00000000..f0911279 --- /dev/null +++ b/convokit/convo_similarity/naive_condyns.py @@ -0,0 +1,261 @@ +import ast +import numpy as np +import os +import re + +try: + from convokit.genai import get_llm_client + + GENAI_AVAILABLE = True +except ImportError: + GENAI_AVAILABLE = False + + +class NaiveConDynS: + """A class to compute naive ConDynS score between two Sequences of Patterns. + + NaiveConDynS computes similarity scores between conversations by directly + comparing their Sequences of Patterns (SoP) without + using conversation transcripts. This provides a simpler approach to measuring + conversation dynamics similarity. + + :param model_provider: The LLM provider to use (e.g., "gpt", "gemini") + :param config: The GenAIConfigManager instance to use + :param model: Optional specific model name + :param custom_naive_condyns_prompt: Custom prompt for the naive condyns prompt template + :param custom_prompt_dir: Directory to save custom prompts (if not provided, overwrites defaults in ./prompts) + """ + + NAIVE_CONDYNS_PROMPT_TEMPLATE = None + + @classmethod + def _load_prompts(cls): + """Lazy load prompts into class variables. + + Loads the NaiveConDynS prompt template from the prompts directory if not already loaded. + """ + if cls.NAIVE_CONDYNS_PROMPT_TEMPLATE is None: + base_path = os.path.dirname(__file__) + with open( + os.path.join(base_path, "prompts/naive_condyns_prompt.txt"), "r", encoding="utf-8" + ) as f: + cls.NAIVE_CONDYNS_PROMPT_TEMPLATE = f.read() + + def __init__( + self, + model_provider: str, + config, + model: str = None, + custom_naive_condyns_prompt: str = None, + custom_prompt_dir: str = None, + ): + """Initialize the NaiveConDynS score computer with a specified model provider and optional model name. + + If no model is specified, defaults to our selected default model. 
+ + :param model_provider: The LLM provider to use (e.g., "gpt", "gemini") + :param config: The GenAIConfigManager instance to use + :param model: Optional specific model name + :param custom_naive_condyns_prompt: Custom prompt for the naive condyns prompt template + :param custom_prompt_dir: Directory to save custom prompts (if not provided, overwrites defaults in ./prompts) + :raises ImportError: If genai dependencies are not available + """ + if not GENAI_AVAILABLE: + raise ImportError( + "GenAI dependencies not available. Please install via `pip install convokit[genai]`." + ) + self.model_provider = model_provider + self.config = config + self.model = model + self.custom_prompt_dir = custom_prompt_dir + + # Load default prompts first + self._load_prompts() + + # Override with custom prompts if provided + if custom_naive_condyns_prompt is not None: + self.NAIVE_CONDYNS_PROMPT_TEMPLATE = custom_naive_condyns_prompt + if custom_prompt_dir: + self._save_custom_prompt("naive_condyns_prompt.txt", custom_naive_condyns_prompt) + else: + self._save_custom_prompt_to_default( + "naive_condyns_prompt.txt", custom_naive_condyns_prompt + ) + + if model is not None: + self.client = get_llm_client(model_provider, config, model=model) + else: + self.client = get_llm_client(model_provider, config) + + def _save_custom_prompt(self, filename: str, prompt_content: str): + """Save custom prompt to the specified directory. + + :param filename: Name of the file to save + :param prompt_content: Content of the prompt to save + """ + if self.custom_prompt_dir: + os.makedirs(self.custom_prompt_dir, exist_ok=True) + filepath = os.path.join(self.custom_prompt_dir, filename) + with open(filepath, "w", encoding="utf-8") as f: + f.write(prompt_content) + + def _save_custom_prompt_to_default(self, filename: str, prompt_content: str): + """Save custom prompt to the default prompts directory. + + :param filename: Name of the file to save + :param prompt_content: Content of the prompt to save + """ + base_path = os.path.dirname(__file__) + filepath = os.path.join(base_path, "prompts", filename) + with open(filepath, "w", encoding="utf-8") as f: + f.write(prompt_content) + + def set_custom_naive_condyns_prompt(self, prompt_text: str, save_to_file: bool = True): + """Set a custom naive condyns prompt template. + + :param prompt_text: The custom prompt text + :param save_to_file: Whether to save the prompt to file in custom_prompt_dir or default prompts directory + """ + self.NAIVE_CONDYNS_PROMPT_TEMPLATE = prompt_text + if save_to_file: + if self.custom_prompt_dir: + self._save_custom_prompt("naive_condyns_prompt.txt", prompt_text) + else: + self._save_custom_prompt_to_default("naive_condyns_prompt.txt", prompt_text) + + def load_custom_prompts_from_directory(self, prompt_dir: str): + """Load custom prompts from a specified directory. + + :param prompt_dir: Directory containing custom prompt files + """ + naive_condyns_path = os.path.join(prompt_dir, "naive_condyns_prompt.txt") + + if os.path.exists(naive_condyns_path): + with open(naive_condyns_path, "r", encoding="utf-8") as f: + self.NAIVE_CONDYNS_PROMPT_TEMPLATE = f.read() + + def _clean_model_output_to_dict(self, text: str) -> dict: + """Clean and parse model output into a dictionary. + + Extracts dictionary content from model responses and handles common + formatting issues for safe parsing. 
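As a rough illustration of the parsing step described above (independent of the private helper itself), the snippet below locates the outermost braces in a raw model reply and evaluates the dictionary literal; the reply string is fabricated.

```python
import ast

# Fabricated model reply: free text wrapped around a Python-style dictionary.
raw = "Here is my assessment:\n{'0': {'analysis': 'clear early disagreement', 'score': 0.9}}"

start, end = raw.find("{"), raw.rfind("}")
if start == -1 or end <= start:
    raise ValueError("No valid dictionary boundaries found.")

parsed = ast.literal_eval(raw[start : end + 1])  # safe evaluation of the dict literal
print(parsed["0"]["score"])  # -> 0.9
```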
+ + :param text: Raw model output text + :return: Parsed dictionary from the model output + :raises ValueError: If no valid dictionary boundaries are found + """ + start = text.find("{") + end = text.rfind("}") + if start == -1 or end == -1 or end <= start: + raise ValueError("No valid dictionary boundaries found.") + + dict_str = text[start : end + 1] + dict_str = re.sub(r"'s\b", "s", dict_str) + dict_str = re.sub(r"'t\b", "t", dict_str) + dict_str = re.sub(r"'ve\b", "ve", dict_str) + return ast.literal_eval(dict_str) + + def compute_unidirectional_naive_condyns(self, sop1, sop2): + """Compute unidirectional naive conditional dynamics similarity between two Sequences of Patterns. + + Compares the SoPs from one conversation against another to measure how well + the dynamics of one conversation match those of another. + + :param sop1: SoP from the first conversation + :param sop2: SoP from the second conversation + :return: Dictionary with analysis and scores for each pattern in sop1 + """ + # Format the prompt with the two sequences of patterns + full_prompt = self.NAIVE_CONDYNS_PROMPT_TEMPLATE.format(sop1=sop1, sop2=sop2) + + response = self.client.generate(full_prompt) + try: + response_dict = self._clean_model_output_to_dict(response.text) + except (SyntaxError, ValueError) as e: + print(response.text) + print("Error parsing output:", e) + raise Exception("error parsing") + return response_dict + + def compute_bidirectional_naive_condyns(self, sop1, sop2): + """Compute bidirectional naive conditional dynamics similarity between two Sequences of Patterns. + + Computes similarity in both directions: sop1 vs sop2 and sop2 vs sop1 + to capture the full dynamics of both conversations. + + :param sop1: SoP from the first conversation + :param sop2: SoP from the second conversation + :return: List of [response_dict1, response_dict2] where each dict contains + analysis and scores for each pattern + """ + response_dict1 = self.compute_unidirectional_naive_condyns(sop1, sop2) + response_dict2 = self.compute_unidirectional_naive_condyns(sop2, sop1) + return [response_dict1, response_dict2] + + def measure_score(self, data): + """Calculate the mean score from a similarity result dictionary. + + :param data: Dictionary containing similarity analysis results + :return: Mean score across all patterns + """ + sum_score = [] + for item in data.values(): + sum_score.append(item["score"]) + return np.mean(sum_score) + + def compute_score_from_results(self, results): + """Compute scores from bidirectional similarity results. + + :param results: List of bidirectional similarity results + :return: List of mean scores for each direction + """ + scores = [] + for result in results: + scores.append(self.measure_score(result)) + return scores + + def compare_conversations(self, corpus, convo_id1: str, convo_id2: str, sop_meta_name: str): + """Compare two conversations using NaiveConDynS and store the result in both conversations' metadata. + + This method retrieves two conversations from the corpus, extracts their SoP data + from metadata, computes the NaiveConDynS score between them, and stores the result in both + conversations' metadata with the key format "condyns_{convo_id1}_{convo_id2}". + + Note: NaiveConDynS only uses SoP data for comparison, not conversation transcripts. 
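Building on the compare_conversations helper documented above, here is a hypothetical usage sketch. The two single-utterance conversations and their SoP metadata are fabricated purely to show the expected data layout; running it requires an API key already configured for the chosen provider, and the metadata key written back follows the condyns_{id1}_{id2} pattern noted above.

```python
from convokit import Corpus, Speaker, Utterance
from convokit.convo_similarity import NaiveConDynS
from convokit.genai.genai_config import GenAIConfigManager

# Build a tiny in-memory corpus with two one-utterance conversations.
spk = Speaker(id="s1")
corpus = Corpus(
    utterances=[
        Utterance(id="a_0", speaker=spk, conversation_id="convo_a", text="I disagree with that."),
        Utterance(id="b_0", speaker=spk, conversation_id="convo_b", text="Fair point, I concede."),
    ]
)

# Fabricated Sequences of Patterns, stored under the metadata name passed below.
corpus.get_conversation("convo_a").meta["machine_sop"] = {"0": "speaker opens with disagreement"}
corpus.get_conversation("convo_b").meta["machine_sop"] = {"0": "speaker concedes early"}

config = GenAIConfigManager()  # assumes an API key was already stored for "gpt"
naive = NaiveConDynS(model_provider="gpt", config=config)

score = naive.compare_conversations(corpus, "convo_a", "convo_b", sop_meta_name="machine_sop")
print(score, corpus.get_conversation("convo_a").meta["condyns_convo_a_convo_b"])
```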
+
+        :param corpus: The ConvoKit Corpus containing the conversations
+        :param convo_id1: ID of the first conversation
+        :param convo_id2: ID of the second conversation
+        :param sop_meta_name: Name of the metadata field containing SoP data
+        :return: The computed NaiveConDynS score
+        :raises KeyError: If conversations don't exist or required metadata is missing
+        :raises ValueError: If SoP data is malformed
+        """
+        # Get conversations from corpus
+        try:
+            convo1 = corpus.get_conversation(convo_id1)
+            convo2 = corpus.get_conversation(convo_id2)
+        except KeyError as e:
+            raise KeyError(f"Conversation not found in corpus: {e}")
+
+        # Extract SoP data from metadata
+        try:
+            sop1 = convo1.meta[sop_meta_name]
+            sop2 = convo2.meta[sop_meta_name]
+        except KeyError as e:
+            raise KeyError(f"SoP metadata '{sop_meta_name}' not found in conversation: {e}")
+
+        # Compute bidirectional NaiveConDynS similarity
+        results = self.compute_bidirectional_naive_condyns(sop1, sop2)
+
+        # Compute the mean score from bidirectional results
+        naive_condyns_score = np.mean(self.compute_score_from_results(results))
+
+        # Store the score in both conversations' metadata
+        score_key1 = f"condyns_{convo_id1}_{convo_id2}"
+        score_key2 = f"condyns_{convo_id2}_{convo_id1}"
+
+        convo1.meta[score_key1] = naive_condyns_score
+        convo2.meta[score_key2] = naive_condyns_score
+
+        return naive_condyns_score
diff --git a/convokit/convo_similarity/prompts/condyns_prompt.txt b/convokit/convo_similarity/prompts/condyns_prompt.txt
new file mode 100644
index 00000000..b3979da9
--- /dev/null
+++ b/convokit/convo_similarity/prompts/condyns_prompt.txt
@@ -0,0 +1,36 @@
+You will be given a transcript and a list of events describing conversational dynamics and trajectories. You are tasked with determining how closely a predefined sequence of dynamics is seen in a provided conversation transcript, both in occurrence and order.
+
+### Input:
+- The sequence of events is provided as a dictionary, where:
+  - Keys: indicate the order of events, starting from '0'.
+  - Values: describe each event.
+
+### Task:
+- Analysis: Analyze how closely a given transcript follows the sequence of described events. Think and analyze whether you see any part of the transcript that resembles the event. Remember that the sequence of events also has to be considered.
+- Similarity Score: Give a float score ranging from 0 to 1 based on your assessment of how closely the transcript follows the description of the trajectory.
+  - Order Penalty: If an event occurs before previous events (according to sequence keys), it should be scored significantly lower.
+  - Proximity of Events: Events in the transcript should closely follow the described sequence. If there are many unrelated events or long gaps between key events, the score should be penalized accordingly.
+  - Speaker Independence: The event can occur between any speakers, and the actual speaker names do not affect the analysis.
+- Example:
+  - 0: No part of the transcript matches the described event at all.
+  - 0.35: A part resembles the described event but it occurred a couple of utterances after the previous bullet point event.
+  - 0.6: A part resembles the described event.
+  - 1: A part exactly matches the described event explicitly and occurred either at the very first utterance or right after the previous event.
+
+### Output Format:
+Provide your output as a Python dictionary with the following structure:
+_(Note: Do NOT use markdown, JSON formatting, or code block delimiters. Do not use any " or ' in ANALYSIS. Do not quote in ANALYSIS. Do not use 's in ANALYSIS.)_
+
+{{
+    '0': {{'analysis': 'ANALYSIS (<=20 words)', 'score': i (0 <= i <= 1) }},
+    '1': ...
+    ...
+}}
+
+List of events: {events}
+
+Conversation: '{transcript}'
+
+Now, give your answer. Remember to follow the specified output format strictly, starting from 0 for the dictionary key.
+
+Output:
\ No newline at end of file
diff --git a/convokit/convo_similarity/prompts/naive_condyns_prompt.txt b/convokit/convo_similarity/prompts/naive_condyns_prompt.txt
new file mode 100644
index 00000000..ff23d0b2
--- /dev/null
+++ b/convokit/convo_similarity/prompts/naive_condyns_prompt.txt
@@ -0,0 +1,36 @@
+You will be given two lists of events describing conversational dynamics and trajectories. You are tasked with determining how closely the first sequence of dynamics is seen in the second sequence of dynamics, both in occurrence and order.
+
+### Input:
+- Both sequences of events are provided as dictionaries, where:
+  - Keys: indicate the order of events, starting from '0'.
+  - Values: describe each event.
+
+### Task:
+- Analysis: Analyze how closely the first sequence of events follows the second sequence. Think and analyze whether you see any event in the second sequence that resembles each event in the first. Remember that the order of events also has to be considered.
+- Similarity Score: Give a float score ranging from 0 to 1 for each event in the first sequence based on your assessment of how closely it is matched in the second sequence.
+  - Order Penalty: If an event occurs before previous events (according to sequence keys), it should be scored significantly lower.
+  - Proximity of Events: The first sequence should closely follow the other. If there are many unrelated events between the matched key events, the score should be penalized accordingly.
+  - Speaker Independence: The event can occur between any speakers, and the actual speaker names do not affect the analysis.
+- Example:
+  - 0: No event in the second sequence matches the described event of the first at all.
+  - 0.35: An event in the second sequence resembles the described event of the first sequence but it does not occur immediately after the previous bullet point event of the first.
+  - 0.6: An event resembles the described event of the first sequence.
+  - 1: An event exactly matches the described event explicitly and is the very first event of the sequence or right after the previous event.
+
+### Output Format:
+Provide your output as a Python dictionary with the following structure:
+_(Note: Do NOT use markdown, JSON formatting, or code block delimiters. Do not use any " or ' in ANALYSIS. Do not quote in ANALYSIS. Do not use 's in ANALYSIS.)_
+
+{{
+    '0': {{'analysis': 'ANALYSIS (<=20 words)', 'score': i (0 <= i <= 1) }},
+    '1': ...
+    ...
+}}
+
+First sequence of events: {sop1}
+
+Second sequence of events: '{sop2}'
+
+Now, give your answer. Remember to follow the specified output format strictly.
+
+Output:
diff --git a/convokit/convo_similarity/prompts/scd_prompt.txt b/convokit/convo_similarity/prompts/scd_prompt.txt
new file mode 100644
index 00000000..7ac4ac16
--- /dev/null
+++ b/convokit/convo_similarity/prompts/scd_prompt.txt
@@ -0,0 +1,26 @@
+Write a short summary capturing the trajectory of an online conversation.
+Do not include specific topics, claims, or arguments from the conversation. 
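Since these prompt templates are filled in via Python's str.format (as the modules above do), literal braces in the output-format example must be doubled so that only the named placeholders are substituted. The shortened template below is a stand-in for illustration, not the full prompt shipped with the module.

```python
# Stand-in template: {{ }} survive .format() as literal braces, while {events} and
# {transcript} are replaced with real values.
template = (
    "Provide your output as a Python dictionary:\n"
    "{{\n    '0': {{'analysis': 'ANALYSIS', 'score': i (0 <= i <= 1) }},\n    ...\n}}\n\n"
    "List of events: {events}\n\n"
    "Conversation: '{transcript}'"
)

filled = template.format(
    events={"0": "speakers open with polite disagreement", "1": "tension escalates into accusations"},
    transcript="SPEAKER1: I see it differently.\n\nSPEAKER2: That makes no sense at all.",
)
print(filled)
```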
The style you should avoid: +Example Sentence 1: “Speaker1, who is Asian, defended Asians and pointed out that a study found that whites, Hispanics, and blacks were accepted into universities in that order, with Asians being accepted the least. Speaker2 acknowledged that Asians have high household income, but argued that this could be a plausible explanation for the study's findings. Speaker1 disagreed and stated that the study did not take wealth into consideration.” +This style mentions specific claims and topics, which are not needed. + +Instead, do include indicators of sentiments (e.g., sarcasm, passive-aggressive, polite, frustration, attack, blame), individual intentions (e.g., agreement, disagreement, persistent-agreement, persistent-disagreement, rebuttal, defense, concession, confusion, clarification, neutral, accusation) and conversational strategies (if any) such as 'rhetorical questions', 'straw man fallacy', 'identify fallacies', and 'appealing to emotions.' +The following sentences demonstrate the style you should follow: + +Example Sentence 2: “Both speakers have differing opinions and appeared defensive. Speaker1 attacks Speaker2 by diminishing the importance of his argument and Speaker2 blames Speaker1 for using profane words. Both speakers accuse each other of being overly judgemental of their personal qualities rather than arguments.” + +Example Sentence 3: “The two speakers refuted each other with back and forth accusations. Throughout the conversation, they kept harshly fault-finding with overly critical viewpoints, creating an intense and inefficient discussion.” + +Example Sentence 4: “Speaker1 attacks Speaker2 by questioning the relevance of his premise and Speaker2 blames Speaker1 for using profane words. Both speakers accuse each other of being overly judgemental of their personal qualities rather than arguments.” + +Overall, the trajectory summary should capture the key moments where the tension of the conversation notably changes. Here is an example of a complete trajectory summary. + +Trajectory Summary: +Multiple users discuss minimum wage. Four speakers express their different points of view subsequently, building off of each other’s arguments. Speaker1 disagrees with a specific point from Speaker2’s argument, triggering Speaker2 to contradict Speaker1 in response. Then, Speaker3 jumps into the conversation to support Speaker1’s argument, which leads Speaker2 to adamantly defend their argument. Speaker2 then quotes a deleted comment, giving an extensive counterargument. The overall tone remains civil. + +Now, provide the trajectory summary for the following conversation. +Conversation Transcript: +{formatted_object} + +Now, summarize this conversation. Remember, do not include specific topics, claims, or arguments from the conversation. Instead, try to capture the speakers' sentiments, intentions, and conversational/persuasive strategies. Limit the trajectory summary to 80 words. + +Trajectory Summary: \ No newline at end of file diff --git a/convokit/convo_similarity/prompts/sop_prompt.txt b/convokit/convo_similarity/prompts/sop_prompt.txt new file mode 100644 index 00000000..d6e7a5af --- /dev/null +++ b/convokit/convo_similarity/prompts/sop_prompt.txt @@ -0,0 +1,12 @@ +Here is a trajectory summary of a conversation that lays out how the dynamics of the conversation developed. You need to parse the summary into events in order. +Follow the following guidelines: +1. Try to maintain the original language of the summary as much as you can. +2. 
Provide your output as a Python dictionary with the following structure:
+_(Note: Do NOT use markdown, JSON formatting, or code block delimiters.)_
+{{
+    '0': "" // description of the event
+    '1': ...
+    ...
+}}
+Here is the summary:
+{formatted_object}
\ No newline at end of file
diff --git a/convokit/convo_similarity/scd.py b/convokit/convo_similarity/scd.py
new file mode 100644
index 00000000..c693b1c4
--- /dev/null
+++ b/convokit/convo_similarity/scd.py
@@ -0,0 +1,207 @@
+import os
+import ast
+import re
+from typing import Callable, Optional, Union, Any, List
+from convokit.transformer import Transformer
+from convokit.model import Corpus, Conversation
+
+try:
+    from convokit.genai import LLMPromptTransformer
+    from convokit.genai.genai_config import GenAIConfigManager
+
+    GENAI_AVAILABLE = True
+except ImportError:
+    GENAI_AVAILABLE = False
+
+
+class SCD(Transformer):
+    """
+    A ConvoKit Transformer that generates Summary of Conversation Dynamics (SCD) and
+    Sequence of Patterns (SoP) for conversations in a corpus through an LLM.
+
+    This transformer takes a corpus and generates SCD and/or SoP for selected conversations,
+    storing the results as metadata on the conversations.
+
+    Prompt Templates:
+    - SCD prompt: Uses {formatted_object} placeholder for the conversation transcript
+    - SoP prompt: Uses {formatted_object} placeholder for the SCD summary
+
+    :param model_provider: The LLM provider to use (e.g., "gpt", "gemini")
+    :param config: The GenAIConfigManager instance to use for LLM configuration
+    :param model: Optional specific model name
+    :param custom_scd_prompt: Custom text for the SCD prompt template. Should include {formatted_object}
+        placeholder for the conversation transcript.
+    :param custom_sop_prompt: Custom text for the SoP prompt template. Should include {formatted_object}
+        placeholder for the SCD summary.
+    :param custom_prompt_dir: Directory to save custom prompts
+    :param generate_scd: Whether to generate SCD summaries (default: True)
+    :param generate_sop: Whether to generate SoP patterns (default: True)
+    :param scd_metadata_name: Name for the SCD metadata field (default: "machine_scd")
+    :param sop_metadata_name: Name for the SoP metadata field (default: "machine_sop")
+    :param conversation_formatter: Optional function to format conversations for processing.
+        Should take a Conversation object and return a string. If None, uses default formatting. 
+ :param llm_kwargs: Additional keyword arguments to pass to the LLM client + """ + + # Class variables for lazy loading of prompts + SUMMARY_PROMPT_TEMPLATE = None + BULLETPOINT_PROMPT_TEMPLATE = None + + @classmethod + def _load_prompts(cls): + """Lazy load prompts into class variables.""" + if cls.SUMMARY_PROMPT_TEMPLATE is None or cls.BULLETPOINT_PROMPT_TEMPLATE is None: + base_path = os.path.dirname(__file__) + with open( + os.path.join(base_path, "prompts/scd_prompt.txt"), "r", encoding="utf-8" + ) as f: + cls.SUMMARY_PROMPT_TEMPLATE = f.read() + with open( + os.path.join(base_path, "prompts/sop_prompt.txt"), "r", encoding="utf-8" + ) as f: + cls.BULLETPOINT_PROMPT_TEMPLATE = f.read() + + def __init__( + self, + model_provider: str, + config, + model: str = None, + custom_scd_prompt: str = None, + custom_sop_prompt: str = None, + custom_prompt_dir: str = None, + generate_scd: bool = True, + generate_sop: bool = True, + scd_metadata_name: str = "machine_scd", + sop_metadata_name: str = "machine_sop", + conversation_formatter: Optional[Callable[[Conversation], str]] = None, + llm_kwargs: Optional[dict] = None, + ): + if not GENAI_AVAILABLE: + raise ImportError( + "GenAI dependencies not available. Please install via `pip install convokit[genai]`." + ) + + self.model_provider = model_provider + self.config = config + self.model = model + self.custom_prompt_dir = custom_prompt_dir + self.generate_scd = generate_scd + self.generate_sop = generate_sop + self.scd_metadata_name = scd_metadata_name + self.sop_metadata_name = sop_metadata_name + self.conversation_formatter = conversation_formatter + self.llm_kwargs = llm_kwargs or {} + + # Load default prompts + self._load_prompts() + + # Set up prompts (use custom if provided) + self.scd_prompt = custom_scd_prompt or self.SUMMARY_PROMPT_TEMPLATE + self.sop_prompt = custom_sop_prompt or self.BULLETPOINT_PROMPT_TEMPLATE + + # Save custom prompts if provided + if custom_scd_prompt is not None: + self._save_custom_prompt("scd_prompt.txt", custom_scd_prompt) + if custom_sop_prompt is not None: + self._save_custom_prompt("sop_prompt.txt", custom_sop_prompt) + + def _save_custom_prompt(self, filename: str, prompt_content: str): + """Save custom prompt to the specified directory.""" + if self.custom_prompt_dir: + os.makedirs(self.custom_prompt_dir, exist_ok=True) + filepath = os.path.join(self.custom_prompt_dir, filename) + else: + base_path = os.path.dirname(__file__) + filepath = os.path.join(base_path, "prompts", filename) + + with open(filepath, "w", encoding="utf-8") as f: + f.write(prompt_content) + + def _default_conversation_formatter(self, conversation: Conversation) -> str: + """ + Default conversation formatter that creates a transcript from conversation utterances. 
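As one possible example of the conversation_formatter hook described above, the sketch below anonymizes speakers in the same SPEAKER1/SPEAKER2 style used by this module's formatting utilities; it is illustrative rather than part of the module.

```python
from convokit.model import Conversation

def anonymized_formatter(conversation: Conversation) -> str:
    """Format a conversation as 'SPEAKERn: text' lines, anonymizing speaker ids."""
    speaker_map = {}
    lines = []
    for utt in conversation.get_chronological_utterance_list():
        if utt.speaker.id not in speaker_map:
            speaker_map[utt.speaker.id] = f"SPEAKER{len(speaker_map) + 1}"
        lines.append(f"{speaker_map[utt.speaker.id]}: {utt.text}")
    return "\n".join(lines)

# e.g. SCD(model_provider="gpt", config=config, conversation_formatter=anonymized_formatter)
```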
+ + :param conversation: The conversation to format + :return: Formatted transcript string + """ + utterances = conversation.get_chronological_utterance_list() + transcript_parts = [] + + for utt in utterances: + speaker_name = f"Speaker_{utt.speaker.id}" + transcript_parts.append(f"{speaker_name}: {utt.text}") + + return "\n".join(transcript_parts) + + def set_custom_scd_prompt(self, prompt_text: str, save_to_file: bool = True): + """Set a custom SCD prompt template.""" + self.scd_prompt = prompt_text + if save_to_file: + self._save_custom_prompt("scd_prompt.txt", prompt_text) + + def set_custom_sop_prompt(self, prompt_text: str, save_to_file: bool = True): + """Set a custom SoP prompt template.""" + self.sop_prompt = prompt_text + if save_to_file: + self._save_custom_prompt("sop_prompt.txt", prompt_text) + + def load_custom_prompts_from_directory(self, prompt_dir: str): + """Load custom prompts from a specified directory.""" + scd_path = os.path.join(prompt_dir, "scd_prompt.txt") + sop_path = os.path.join(prompt_dir, "sop_prompt.txt") + + if os.path.exists(scd_path): + with open(scd_path, "r", encoding="utf-8") as f: + self.scd_prompt = f.read() + + if os.path.exists(sop_path): + with open(sop_path, "r", encoding="utf-8") as f: + self.sop_prompt = f.read() + + def transform( + self, corpus: Corpus, selector: Callable[[Conversation], bool] = lambda x: True + ) -> Corpus: + """ + Transform the corpus by generating SCD and/or SoP for selected conversations. + + :param corpus: The target corpus + :param selector: A function that takes a Conversation object and returns True/False + to determine which conversations to process. By default, processes all conversations. + :return: The modified corpus with SCD/SoP metadata added to conversations + """ + if self.generate_scd: + formatter = self.conversation_formatter or self._default_conversation_formatter + scd_transformer = LLMPromptTransformer( + provider=self.model_provider, + model=self.model, + object_level="conversation", + prompt=self.scd_prompt, + formatter=formatter, + metadata_name=self.scd_metadata_name, + selector=selector, + config_manager=self.config, + llm_kwargs=self.llm_kwargs, + ) + scd_transformer.transform(corpus) + + if self.generate_sop: + # Formatter that gets the SCD from conversation metadata + def scd_formatter(conversation): + if self.scd_metadata_name not in conversation.meta: + raise ValueError(f"SCD not found for conversation {conversation.id}") + return conversation.meta.get(self.scd_metadata_name, "") + + sop_transformer = LLMPromptTransformer( + provider=self.model_provider, + model=self.model, + object_level="conversation", + prompt=self.sop_prompt, + formatter=scd_formatter, + metadata_name=self.sop_metadata_name, + selector=selector, + config_manager=self.config, + llm_kwargs=self.llm_kwargs, + ) + sop_transformer.transform(corpus) + + return corpus diff --git a/convokit/convo_similarity/utils.py b/convokit/convo_similarity/utils.py new file mode 100644 index 00000000..a342f6e0 --- /dev/null +++ b/convokit/convo_similarity/utils.py @@ -0,0 +1,355 @@ +import numpy as np +import matplotlib.pyplot as plt + + +def format_wiki_transcript_from_convokit(corpus, convo_id, truncated_by=0, start_at=0): + """Format a wiki conversation from convokit Wikipedia corpus. + + Converts a conversation from a ConvoKit corpus into a formatted transcript + suitable for wiki-style conversations, handling personal attacks and truncation. 
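For reference, a minimal end-to-end sketch of the transform flow shown above: it assumes an API key has already been configured through GenAIConfigManager, and the corpus choice and gpt-4o-mini model are illustrative rather than required.

```python
from convokit import Corpus, download
from convokit.convo_similarity import SCD
from convokit.genai.genai_config import GenAIConfigManager

config = GenAIConfigManager()  # assumes the "gpt" API key was set beforehand
corpus = Corpus(filename=download("friends-corpus"))

scd = SCD(model_provider="gpt", config=config, model="gpt-4o-mini")

# Restrict the (paid) LLM calls to a couple of conversations.
target_ids = set(corpus.get_conversation_ids()[:2])
scd.transform(corpus, selector=lambda convo: convo.id in target_ids)

convo = corpus.get_conversation(next(iter(target_ids)))
print(convo.meta["machine_scd"])  # trajectory summary (SCD)
print(convo.meta["machine_sop"])  # Sequence of Patterns parsed from the summary
```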
+ + :param corpus: ConvoKit corpus containing the conversation + :param convo_id: ID of the conversation to format + :param truncated_by: Number of utterances to truncate from the end (default: 0) + :param start_at: Index to start from in the utterance list (default: 0) + :return: List of formatted transcript lines + """ + convo = corpus.get_conversation(convo_id) + utt_list = convo.get_chronological_utterance_list() + transcription = [] + spk_list = {} + if convo.meta["conversation_has_personal_attack"]: + utt_list = utt_list[: len(utt_list) - 1] + utt_list = utt_list[: len(utt_list) - truncated_by] + utt_list = utt_list[start_at:] + for utt in utt_list: + if utt.speaker.id not in spk_list.keys(): + spk_list[utt.speaker.id] = len(spk_list) + 1 + transcription.append("SPEAKER" + str(spk_list[utt.speaker.id]) + ": " + utt.text) + return transcription + + +def format_transcript_from_convokit(corpus, convo_id, truncated_by=3, start_at=0): + """Format a Reddit conversation from convokit Redditcorpus. + + Converts a conversation from a ConvoKit corpus into a formatted transcript + suitable for Reddit-style conversations, handling removed comments and truncation. + + :param corpus: ConvoKit corpus containing the conversation + :param convo_id: ID of the conversation to format + :param truncated_by: Number of utterances to truncate from the end (default: 3) + :param start_at: Index to start from in the utterance list (default: 0) + :return: List of formatted transcript lines + """ + convo = corpus.get_conversation(convo_id) + utt_list = convo.get_chronological_utterance_list() + transcription = [] + spk_list = {} + if convo.meta["has_removed_comment"]: + utt_list = utt_list[: len(utt_list) - 1] + utt_list = utt_list[: len(utt_list) - truncated_by] + utt_list = utt_list[start_at:] + for utt in utt_list: + if utt.speaker.id not in spk_list.keys(): + spk_list[utt.speaker.id] = len(spk_list) + 1 + transcription.append("SPEAKER" + str(spk_list[utt.speaker.id]) + ": " + utt.text) + return transcription + + +def format_transcript_from_convokit_utt_lst(corpus, utt_lst, truncated_by=3, start_at=0): + """Format a Reddit conversation from convokit Redditcorpus. + + Converts a conversation from a ConvoKit corpus into a formatted transcript + suitable for Reddit-style conversations, handling removed comments and truncation. + + :param corpus: ConvoKit corpus containing the conversation + :param convo_id: ID of the conversation to format + :param truncated_by: Number of utterances to truncate from the end (default: 3) + :param start_at: Index to start from in the utterance list (default: 0) + :return: List of formatted transcript lines + """ + utt_list = utt_lst + transcription = [] + spk_list = {} + utt_list = utt_list[: len(utt_list) - truncated_by] + utt_list = utt_list[start_at:] + for utt in utt_list: + if utt.speaker.id not in spk_list.keys(): + spk_list[utt.speaker.id] = len(spk_list) + 1 + transcription.append("SPEAKER" + str(spk_list[utt.speaker.id]) + ": " + utt.text) + return transcription + + +def format_transcript_from_convokit_delta( + corpus, convo_id, truncate_first_op_utt=True, truncate_last_op_utt=False +): + """Format a Reddit delta conversation from convokit Reddit corpus. + + Converts a conversation from a ConvoKit corpus into a formatted transcript + suitable for Reddit delta conversations, with options to truncate first/last utterances. 
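To clarify the truncated_by / start_at semantics shared by the formatting helpers above, here is a self-contained toy run on plain (speaker, text) tuples rather than a real corpus.

```python
raw_turns = [("alice", "hi"), ("bob", "hello"), ("alice", "why?"), ("bob", "because."), ("alice", "ok")]

truncated_by, start_at = 1, 1  # drop 1 turn from the end, then skip 1 turn at the start
turns = raw_turns[: len(raw_turns) - truncated_by][start_at:]

speaker_ids, lines = {}, []
for speaker, text in turns:
    if speaker not in speaker_ids:
        speaker_ids[speaker] = len(speaker_ids) + 1  # first-seen speaker becomes SPEAKER1, etc.
    lines.append(f"SPEAKER{speaker_ids[speaker]}: {text}")

print(lines)  # ['SPEAKER1: hello', 'SPEAKER2: why?', 'SPEAKER1: because.']
```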
+
+    :param corpus: ConvoKit corpus containing the conversation
+    :param convo_id: ID of the conversation to format
+    :param truncate_first_op_utt: Whether to remove the first utterance (default: True)
+    :param truncate_last_op_utt: Whether to remove the last utterance if it's from the same speaker (default: False)
+    :return: List of formatted transcript lines
+    """
+    convo = corpus.get_conversation(convo_id)
+    utt_list = convo.get_chronological_utterance_list()
+    transcription = []
+    spk_list = {utt_list[0].speaker.id: "SPEAKER1"}
+    for utt in utt_list:
+        if utt.speaker.id not in spk_list.keys():
+            spk_list[utt.speaker.id] = "SPEAKER2"
+            assert len(spk_list) == 2
+        transcription.append(spk_list[utt.speaker.id] + ": " + utt.text)
+    if truncate_first_op_utt:
+        transcription = transcription[1:]
+    if truncate_last_op_utt and utt_list[-1].speaker.id == utt_list[0].speaker.id:
+        transcription.pop()
+    return transcription
+
+
+def get_human_summary(corpus, convo_id):
+    """Get the human written SCD of a conversation from a convokit corpus, if it exists.
+
+    :param corpus: ConvoKit corpus containing the conversation
+    :param convo_id: ID of the conversation to get summary for
+    :return: Human written SCD metadata
+    :raises Exception: If the conversation does not have a human written summary
+    """
+    convo = corpus.get_conversation(convo_id)
+    for summary in convo.meta["summary_meta"]:
+        if summary["summary_type"] == "human_written_SCD":
+            return summary
+    raise Exception("The conversation does not have any human written summary.")
+
+
+def get_machine_summary(corpus, convo_id):
+    """Get the machine generated SCD of a conversation from a convokit corpus, if it exists.
+
+    :param corpus: ConvoKit corpus containing the conversation
+    :param convo_id: ID of the conversation to get summary for
+    :return: Machine generated SCD metadata
+    :raises Exception: If the conversation does not have a machine generated summary
+    """
+    convo = corpus.get_conversation(convo_id)
+    for summary in convo.meta["summary_meta"]:
+        if summary["summary_type"] == "machine_generated_SCD":
+            return summary
+    raise Exception("The conversation does not have any machine generated summary.")
+
+
+def get_human_summary_pair_lst(corpus):
+    """Get the list of paired conversations and their human written SCDs.
+
+    Finds all conversations in the corpus that have human written SCDs
+    and returns them as pairs.
+
+    :param corpus: ConvoKit corpus to search for conversations with SCDs
+    :return: List of conversation pairs with human SCDs
+    """
+    human_summary_ids = corpus.get_conversation_ids(
+        selector=lambda conversation: conversation.meta["summary_meta"] != []
+        and any(
+            summary_meta["summary_type"] == "human_written_SCD"
+            for summary_meta in conversation.meta["summary_meta"]
+        )
+    )
+    human_summary_pair = []  # (calm, awry)
+    for convo_id in human_summary_ids:
+        convo = corpus.get_conversation(convo_id)
+        if convo.meta["has_removed_comment"]:
+            if (convo.meta["pair_id"], convo.id) not in human_summary_pair:
+                human_summary_pair.append((convo.meta["pair_id"], convo.id))
+        else:
+            if (convo.id, convo.meta["pair_id"]) not in human_summary_pair:
+                human_summary_pair.append((convo.meta["pair_id"], convo.id))
+    print("Number of conversation pairs: ", len(human_summary_pair))
+    return human_summary_pair
+
+
+def get_pair_id(corpus, convo_id):
+    """Get the paired conversation's id of a conversation from a convokit corpus. 
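The getters above assume conversation-level summary_meta entries keyed by summary_type; the sketch below shows that expected shape with fabricated values (fields other than summary_type are hypothetical).

```python
# Fabricated summary_meta list illustrating the lookup performed by the helpers above.
summary_meta = [
    {"summary_type": "human_written_SCD", "summary": "Two speakers escalate from disagreement to accusations."},
    {"summary_type": "machine_generated_SCD", "summary": "Speakers trade rebuttals while staying civil."},
]

human = next(s for s in summary_meta if s["summary_type"] == "human_written_SCD")
print(human)
```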
+ + :param corpus: ConvoKit corpus containing the conversation + :param convo_id: ID of the conversation to find pair for + :return: ID of the paired conversation + :raises Exception: If the conversation is not found in pairings + """ + human_summary_pair = get_human_summary_pair_lst(corpus) + for pair in human_summary_pair: + if convo_id in pair: + return pair[0] if convo_id == pair[1] else pair[1] + raise Exception("convo not found in pairings") + + +def count_yes_no(data): + """Count the number of yes and no judgements in a dictionary. + + :param data: Dictionary containing judgement data + :return: Tuple of (yes_count, no_count) + """ + yes_count = sum(1 for item in data.values() if item["judgement"] == "Yes") + no_count = sum(1 for item in data.values() if item["judgement"] == "No") + return yes_count, no_count + + +def measure_score(data): + """Measure the score of a conversation from a convokit corpus. + + Calculates the mean score from similarity analysis results. + + :param data: Dictionary containing similarity analysis results + :return: Mean score across all events + """ + sum_score = [] + for item in data.values(): + sum_score.append(item["score"]) + return np.mean(sum_score) + + +def summarize_statistics(lst, label): + """Summarize the statistics of a list of scores. + + Prints mean, median, and percentile statistics for a list of scores. + + :param lst: List of scores to analyze + :param label: Label to print before the statistics + """ + print(f"{label}") + print(f" Mean: {np.mean(lst):.2f}") + print(f" Median: {np.median(lst):.2f}") + print(f" 25th Percentile: {np.percentile(lst, 25):.2f}") + print(f" 75th Percentile: {np.percentile(lst, 75):.2f}") + + +def plot_numerical_summary(data_self, data_pair): + """Plot the numerical summary of a list of scores. + + Creates a scatter plot comparing two groups of scores with summary statistics. + + :param data_self: List of scores for the self group + :param data_pair: List of scores for the pair group + """ + summary_self = { + "mean": np.mean(data_self), + "median": np.median(data_self), + "percentile_25": np.percentile(data_self, 25), + "percentile_75": np.percentile(data_self, 75), + } + + summary_pair = { + "mean": np.mean(data_pair), + "median": np.median(data_pair), + "percentile_25": np.percentile(data_pair, 25), + "percentile_75": np.percentile(data_pair, 75), + } + + plt.figure(figsize=(12, 2)) + plt.scatter(data_self, [1] * len(data_self), color="blue", alpha=0.6, label="Self Group") + plt.scatter(data_pair, [0] * len(data_pair), color="green", alpha=0.6, label="Pair Group") + + plt.scatter( + list(summary_self.values()), + [1] * 4, + color="red", + marker="x", + s=100, + label="Self Summary Stats", + ) + plt.scatter( + list(summary_pair.values()), + [0] * 4, + color="orange", + marker="x", + s=100, + label="Pair Summary Stats", + ) + + plt.yticks([0, 1], ["Self Group", "Pair Group"]) + plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.1), ncol=2) + plt.grid(axis="x", linestyle="--", alpha=0.5) + + plt.tight_layout() + plt.show() + + +def evaluate(result): + """Evaluate the similarity of a conversation from a convokit corpus. + + Compares self-similarity and pair-similarity scores and provides statistical + analysis and visualization. 
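A quick sketch of the statistics these evaluation helpers report, using fabricated score lists in which self-simulation scores are expected to sit above pair-simulation scores.

```python
import numpy as np

self_scores = [0.81, 0.74, 0.90, 0.66, 0.70]  # fabricated self-simulation scores
pair_scores = [0.42, 0.55, 0.61, 0.50, 0.58]  # fabricated pair-simulation scores

for label, scores in (("Self", self_scores), ("Pair", pair_scores)):
    print(
        f"{label}: mean={np.mean(scores):.2f}, median={np.median(scores):.2f}, "
        f"IQR=[{np.percentile(scores, 25):.2f}, {np.percentile(scores, 75):.2f}]"
    )
```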
+ + :param result: Dictionary containing similarity results for conversations + :return: Tuple of (count, tied, total, convo_self_judgement_percent, convo_pair_judgement_percent) + """ + convo_self_judgement_percent = [] + convo_pair_judgement_percent = [] + + count, tied, total = 0, 0, 0 + + for convo_id, convo_result in result.items(): + total += 1 + score_self = measure_score(convo_result["self"]) + score_pair = measure_score(convo_result["pair"]) + + self_acc = score_self + pair_acc = score_pair + + if self_acc > pair_acc: + count += 1 + + if self_acc < pair_acc: + print(convo_id) + + convo_self_judgement_percent.append(self_acc) + convo_pair_judgement_percent.append(pair_acc) + + summarize_statistics(convo_self_judgement_percent, "Evaluating Self-Simulated Conversation") + print() + summarize_statistics(convo_pair_judgement_percent, "Evaluating Pair-Simulated Conversation") + plot_numerical_summary(convo_self_judgement_percent, convo_pair_judgement_percent) + return count, tied, total, convo_self_judgement_percent, convo_pair_judgement_percent + + +def evaluate_two(result1, result2): + """Evaluate the similarity of two conversations from a convokit corpus. + + Compares self-similarity and pair-similarity scores from two different results + and provides statistical analysis and visualization. + + :param result1: First dictionary containing similarity results for conversations + :param result2: Second dictionary containing similarity results for conversations + :return: Tuple of (count, tied, total, convo_self_judgement_percent, convo_pair_judgement_percent) + """ + convo_self_judgement_percent = [] + convo_pair_judgement_percent = [] + + count, tied, total = 0, 0, 0 + + for convo_id in result1: + total += 1 + score_self = measure_score(result1[convo_id]["self"]) + score_pair = measure_score(result1[convo_id]["pair"]) + + score_self_mirror = measure_score(result2[convo_id]["self"]) + score_pair_mirror = measure_score(result2[convo_id]["pair"]) + + self_acc = score_self + score_self_mirror + pair_acc = score_pair + score_pair_mirror + + if self_acc > pair_acc: + count += 1 + + if self_acc < pair_acc: + print(convo_id) + + convo_self_judgement_percent.append(self_acc) + convo_pair_judgement_percent.append(pair_acc) + + summarize_statistics(convo_self_judgement_percent, "Evaluating Self-Simulated Conversation") + print() + summarize_statistics(convo_pair_judgement_percent, "Evaluating Pair-Simulated Conversation") + plot_numerical_summary(convo_self_judgement_percent, convo_pair_judgement_percent) + return count, tied, total, convo_self_judgement_percent, convo_pair_judgement_percent diff --git a/convokit/genai/.gitignore b/convokit/genai/.gitignore new file mode 100644 index 00000000..94656643 --- /dev/null +++ b/convokit/genai/.gitignore @@ -0,0 +1 @@ +test.py \ No newline at end of file diff --git a/convokit/genai/__init__.py b/convokit/genai/__init__.py new file mode 100644 index 00000000..df178620 --- /dev/null +++ b/convokit/genai/__init__.py @@ -0,0 +1,37 @@ +from .base import LLMClient, LLMResponse +from .genai_config import GenAIConfigManager + +GPTClient = None +GeminiClient = None +LocalClient = None + +try: + from .gpt_client import GPTClient +except ImportError: + pass + +try: + from .gemini_client import GeminiClient +except ImportError: + pass + +try: + from .local_client import LocalClient +except ImportError: + pass + +from .factory import get_llm_client +from .llm_transformer import LLM +from .llmprompttransformer import LLMPromptTransformer + +__all__ = [ + "LLMClient", 
+ "LLMResponse", + "GPTClient", + "GeminiClient", + "LocalClient", + "get_llm_client", + "GenAIConfigManager", + "LLM", + "LLMPromptTransformer", +] diff --git a/convokit/genai/base.py b/convokit/genai/base.py new file mode 100644 index 00000000..23754222 --- /dev/null +++ b/convokit/genai/base.py @@ -0,0 +1,51 @@ +from abc import ABC, abstractmethod +import time + + +class LLMResponse: + """Encapsulates the response from an LLM client. + + Contains the generated text, token usage information, latency, and raw response data. + + :param text: The generated text response from the LLM + :param tokens: Number of tokens used in the generation (may be -1 if not available from the LLM output) + :param latency: Time taken for generation in seconds + :param raw: Raw response object from the LLM + """ + + def __init__(self, text: str, tokens: int, latency: float, raw: dict): + self.text = text + self.tokens = tokens + self.latency = latency + self.raw = raw + + def __repr__(self): + return f"LLMResponse(text={self.text[:30]}..., tokens={self.tokens}, latency={self.latency:.2f}s)" + + +class LLMClient(ABC): + """Abstract base class for LLM clients. Used as a template for all LLM clients. + + Provides a common interface for different LLM providers (GPT, Gemini, local models, etc.). + All LLM clients should inherit from this class and implement the required methods. + """ + + @abstractmethod + def generate(self, messages, **kwargs) -> LLMResponse: + """Generate text from the LLM. + + :param messages: Input messages/prompt for the LLM. Can be a string or list of message dicts + :param **kwargs: Additional parameters for generation (temperature, max_tokens, etc.) + :return: LLMResponse object containing the generated text and metadata + """ + pass + + def stream(self, messages, callback, **kwargs): + """Stream text generation from the LLM. Notice that this is not supported yet. + + :param messages: Input messages/prompt for the LLM + :param callback: Function to call with each generated token/chunk + :param **kwargs: Additional parameters for generation + :raises NotImplementedError: If streaming is not supported by this client + """ + raise NotImplementedError("Streaming not supported yet.") diff --git a/convokit/genai/example/example.ipynb b/convokit/genai/example/example.ipynb new file mode 100644 index 00000000..ee67bc34 --- /dev/null +++ b/convokit/genai/example/example.ipynb @@ -0,0 +1,258 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d52b677a", + "metadata": {}, + "source": [ + "# Example of How to Use GenAI with ConvoKit genai Module\n", + "\n", + "The ConvoKit GenAI module provides a unified interface for working with large language models (LLMs) while doing conversational analysis in ConvoKit. It supports multiple providers including OpenAI GPT, Google Gemini, and local models through a simple factory pattern. This module makes it easy to integrate AI-powered text generation into your ConvoKit workflows for diverse tasks. The module handles API key management, response formatting, and provides consistent interfaces across different LLM providers.\n" + ] + }, + { + "cell_type": "markdown", + "id": "321da09c", + "metadata": {}, + "source": [ + "## Setup config for GenAI with GPT\n", + "\n", + "Setting up config info to access models is mandatory but simple. For models we implemented (GPT and Gemini), we provide methods to set API keys so they are stored in the environment. For other models or local LLMs, users can also implement them in similar manner. 
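To illustrate the client template defined by LLMClient and LLMResponse above, here is a hypothetical "echo" client; it performs no real model call, and an actual implementation would query its provider's API inside generate().

```python
import time
from convokit.genai import LLMClient, LLMResponse

class EchoClient(LLMClient):
    """Toy client that simply echoes its input back, wrapped in an LLMResponse."""

    def __init__(self, config_manager=None, **kwargs):
        self.config_manager = config_manager

    def generate(self, messages, **kwargs) -> LLMResponse:
        start = time.time()
        # Accept either a plain string or a list of chat-style message dicts.
        text = messages if isinstance(messages, str) else " ".join(m["content"] for m in messages)
        return LLMResponse(text=text, tokens=-1, latency=time.time() - start, raw={"echo": text})

print(EchoClient().generate("Tell me a fun fact about Cornell University.").text)
```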
Here we provide a simple demonstration with configuring for OpenAI's GPT model." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "be24454d", + "metadata": {}, + "outputs": [], + "source": [ + "from convokit.genai.genai_config import GenAIConfigManager" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5666dc72", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Successfully set OpenAI API key in config.\n" + ] + } + ], + "source": [ + "config = GenAIConfigManager()\n", + "config.set_api_key(\"gpt\", \"YOUR API KEY\")\n", + "print(f\"Successfully set OpenAI API key in config.\")" + ] + }, + { + "cell_type": "markdown", + "id": "39dce4b7", + "metadata": {}, + "source": [ + "## Initialize clients to Communicate with models\n", + "\n", + "After setting the API key, we are ready to communicate with the models. Retrieve response the same as you would normally do interacting with models through API. However, we do wrap the LLM responses in a unified class, so we can handle all LLM response format easily. Users are expected to follow similar template when implementing clients for other models." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "220c33fe", + "metadata": {}, + "outputs": [], + "source": [ + "from convokit.genai import get_llm_client\n", + "\n", + "MODEL_PROVIDER = \"gpt\"\n", + "client = get_llm_client(MODEL_PROVIDER, config)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "5ea36378", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Text: A fun fact about Cornell University is that it is home to the world's first university-based hotel management program. Established in 1922, the Cornell School of Hotel Administration has become a leader in hospitality education, attracting students from around the globe who aspire to careers in the hospitality industry. The program is renowned for its rigorous curriculum and strong connections to the industry, making it a top choice for aspiring hotel and restaurant managers.\n", + "Tokens: 99\n", + "Latency: 1.7928547859191895\n" + ] + } + ], + "source": [ + "response = client.generate([{\"role\": \"user\", \"content\": \"Tell me a fun fact about Cornell University.\"}])\n", + "print(\"Text:\", response.text)\n", + "print(\"Tokens:\", response.tokens)\n", + "print(\"Latency:\", response.latency)" + ] + }, + { + "cell_type": "markdown", + "id": "33b72969", + "metadata": {}, + "source": [ + "# Setup ConvoKit GenAI with Google Gemini Through Vertex AI\n", + "\n", + "Very similar to GPT. Checkout Vertex AI: https://cloud.google.com/vertex-ai/generative-ai/docs/start/quickstarts/quickstart-multimodal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbb38d6e", + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_PROVIDER = \"gemini\"\n", + "MODEL = \"gemini-2.0-flash-001\"\n", + "config.set_google_cloud_config(\"YOUR PROJECT\", \"YOUR LOCATION\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "32fad57a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Text: Here's a fun fact about Cornell University:\n", + "\n", + "Cornell has a tradition called \"Dragon Day\" where architecture students build a giant dragon and parade it across campus. 
The dragon is often pitted against a phoenix built by engineering students, leading to a playful rivalry and a spectacle of creativity and engineering!\n", + "\n" + ] + } + ], + "source": [ + "client = get_llm_client(MODEL_PROVIDER, config)\n", + "response = client.generate(\"Tell me a fun fact about Cornell University.\")\n", + "print(\"Text:\", response.text)" + ] + }, + { + "cell_type": "markdown", + "id": "499d983b", + "metadata": {}, + "source": [ + "# Using LLMPromptTransformer with ConvoKit\n", + "\n", + "The LLMPromptTransformer provides a powerful way to apply LLM processing to different levels of ConvoKit objects (utterances, conversations, speakers, or corpus). Let's demonstrate this with the friends corpus.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72195e24", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset already exists at /reef/kz88/convokit/download_corpus/friends-corpus\n", + "First conversation ID: s01_e01_c01_u001\n", + "We'll process the first 2 utterances:\n", + " Utterance 1: Monica Geller: There's nothing to tell! He's just some guy I work with!...\n", + " Utterance 2: Joey Tribbiani: C'mon, you're going out with the guy! There's gotta be something wrong with him!...\n" + ] + } + ], + "source": [ + "from convokit import Corpus, download\n", + "from convokit.genai import LLMPromptTransformer\n", + "\n", + "corpus = Corpus(filename=download(\"friends-corpus\"))\n", + "\n", + "first_convo = corpus.get_conversation(corpus.get_conversation_ids()[0])\n", + "assert len(first_convo.get_utterance_ids()) > 2\n", + "utterances = first_convo.get_utterance_ids()[:2]\n", + "\n", + "print(f\"First conversation ID: {first_convo.id}\")\n", + "print(f\"We'll process the first 2 utterances:\")\n", + "for i, uttid in enumerate(utterances):\n", + " utt = corpus.get_utterance(uttid)\n", + " print(f\" Utterance {i+1}: {utt.speaker.id}: {utt.text[:100]}...\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "b33a7438", + "metadata": {}, + "source": [ + "## Example: Sentiment Analysis on Utterances\n", + "\n", + "Let's create a GenAI transformer that analyzes the sentiment of utterances and stores the result as metadata.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3618d385", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sentiment analysis results:\n", + " Monica Geller: 'There's nothing to tell! He's just some guy I work...' -> Sentiment: Negative\n", + " Joey Tribbiani: 'C'mon, you're going out with the guy! There's gott...' -> Sentiment: Negative\n" + ] + } + ], + "source": [ + "sentiment_transformer = LLMPromptTransformer(\n", + " provider=\"gpt\",\n", + " model=\"gpt-4o-mini\",\n", + " object_level=\"utterance\", # Process at utterance level\n", + " prompt=\"Analyze the sentiment of the following text and respond with just one word: 'positive', 'negative', or 'neutral'. 
Text: {formatted_object}\",\n", + " formatter=lambda utterance: utterance.text,\n", + " metadata_name=\"gpt_sentiment\", # Store result in 'gpt_sentiment' metadata field\n", + " selector=lambda utterance: utterance.id in utterances,\n", + " config_manager=config\n", + ")\n", + "\n", + "corpus = sentiment_transformer.transform(corpus)\n", + "\n", + "print(\"Sentiment analysis results:\")\n", + "for uttid in utterances:\n", + " utt = corpus.get_utterance(uttid)\n", + " sentiment = utt.meta.get(\"gpt_sentiment\", \"Not processed\")\n", + " print(f\" {utt.speaker.id}: '{utt.text[:50]}...' -> Sentiment: {sentiment}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/convokit/genai/factory.py b/convokit/genai/factory.py new file mode 100644 index 00000000..c2763c23 --- /dev/null +++ b/convokit/genai/factory.py @@ -0,0 +1,50 @@ +GPTClient = None +GeminiClient = None +LocalClient = None + +try: + from .gpt_client import GPTClient +except ImportError: + pass + +try: + from .gemini_client import GeminiClient +except ImportError: + pass + +try: + from .local_client import LocalClient +except ImportError: + pass + + +def get_llm_client(provider: str, config_manager, **kwargs): + """Factory function as a unified interface to create LLM client instances. + + Creates and returns the appropriate LLM client based on the provider name. + The client is initialized with the config manager and any additional parameters. + + :param provider: Name of the LLM provider ("gpt", "gemini", "local") + :param config_manager: Configuration manager instance to pass to the client + :param **kwargs: Additional parameters to pass to the client constructor + :return: Initialized LLM client instance + :raises ValueError: If the provider is not supported or dependencies are missing + """ + if provider.lower() == "gpt": + if GPTClient is None: + raise ValueError("GPT client not available. Please install the 'openai' package.") + return GPTClient(config_manager=config_manager, **kwargs) + elif provider.lower() == "gemini": + if GeminiClient is None: + raise ValueError( + "Gemini client not available. Please install the 'google-genai' package." + ) + return GeminiClient(config_manager=config_manager, **kwargs) + elif provider.lower() == "local": + if LocalClient is None: + raise ValueError( + "Local client not available. Please install required dependencies for local model support." + ) + return LocalClient(config_manager=config_manager, **kwargs) + else: + raise ValueError(f"Unsupported provider: {provider}") diff --git a/convokit/genai/gemini_client.py b/convokit/genai/gemini_client.py new file mode 100644 index 00000000..0a6ae42b --- /dev/null +++ b/convokit/genai/gemini_client.py @@ -0,0 +1,87 @@ +import os +from google import genai +from google.genai.types import GenerateContentConfig, HttpOptions +from .base import LLMClient, LLMResponse +from .genai_config import GenAIConfigManager +import time + + +class GeminiClient(LLMClient): + """Client for interacting with Google Gemini models via Vertex AI. + + This client is configured to use Vertex AI and requires Google Cloud project and location + to be set. 
Configuration can be provided via the GenAI config system or environment variables. + + :param model: Name of the Gemini model to use (default: "gemini-2.0-flash-001") + :param config_manager: GenAIConfigManager instance (optional, will create one if not provided) + """ + + def __init__( + self, + model: str = "gemini-2.0-flash-001", + config_manager: GenAIConfigManager = None, + ): + if config_manager is None: + config_manager = GenAIConfigManager() + + self.config_manager = config_manager + + # Get required Vertex AI configuration + google_cloud_project = config_manager.get_google_cloud_project() + google_cloud_location = config_manager.get_google_cloud_location() + + # Validate required fields + if not google_cloud_project: + raise ValueError( + "Google Cloud project is required for Vertex AI. " + "Set it using config_manager.set_google_cloud_config(project, location) " + "or via GOOGLE_CLOUD_PROJECT environment variable." + ) + + if not google_cloud_location: + raise ValueError( + "Google Cloud location is required for Vertex AI. " + "Set it using config_manager.set_google_cloud_config(project, location) " + "or via GOOGLE_CLOUD_LOCATION environment variable." + ) + + # Set up Vertex AI environment + os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "true" + os.environ["GOOGLE_CLOUD_PROJECT"] = google_cloud_project + os.environ["GOOGLE_CLOUD_LOCATION"] = google_cloud_location + + self.client = genai.Client(http_options=HttpOptions(api_version="v1")) + self.model = model + + def generate(self, prompt, temperature=0.0, times_retried=0) -> LLMResponse: + """Generate text using the Gemini model. + + Sends a prompt to the Gemini model and returns the generated response. The function includes + retry logic for API errors and handles different input formats. + + :param prompt: Input prompt for generation + :param temperature: Sampling temperature for generation (default: 0.0) + :param times_retried: Number of retry attempts made so far (for internal use) + :return: LLMResponse object containing the generated text and metadata + :raises Exception: If retry attempts are exhausted + """ + start = time.time() + retry_after = 10 + + try: + response = self.client.models.generate_content( + model=self.model, + contents=prompt, + config=GenerateContentConfig(temperature=temperature), + ) + except Exception as e: + if times_retried >= 3: + raise Exception("Retry failed after multiple attempts.") from e + print(f"Gemini Exception: {e}. Retrying in {retry_after}s...") + time.sleep(retry_after) + return self.generate(prompt, temperature, times_retried + 1) + + elapsed = time.time() - start + text = response.text + # Gemini does not currently provide token usage reliably + return LLMResponse(text=text, tokens=-1, latency=elapsed, raw=response) diff --git a/convokit/genai/genai_config.py b/convokit/genai/genai_config.py new file mode 100644 index 00000000..aa089617 --- /dev/null +++ b/convokit/genai/genai_config.py @@ -0,0 +1,94 @@ +import os +from pathlib import Path +import yaml +from typing import Optional + + +class GenAIConfigManager: + """Manages configuration for GenAI clients, including setting and accessing API keys. + + Handles loading and saving of GenAI related configuration data, with support + for environment variable overrides. Provides a centralized way to manage API keys + and other configuration settings for different LLM providers. 
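To make the precedence between environment variables and the stored configuration file concrete, here is an illustrative sketch (the key strings are placeholders; running it writes the placeholder value to your local config file):

```python
import os

from convokit.genai.genai_config import GenAIConfigManager

config = GenAIConfigManager()               # defaults to ~/.convokit/config.yml
config.set_api_key("gpt", "key-from-file")  # placeholder value, persisted to the YAML file

# An environment variable of the form <PROVIDER>_API_KEY takes precedence when set.
os.environ["GPT_API_KEY"] = "key-from-env"
assert config.get_api_key("gpt") == "key-from-env"

# Without the environment variable, the value stored in the config file is returned.
del os.environ["GPT_API_KEY"]
assert config.get_api_key("gpt") == "key-from-file"
```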
+ + :param path: Path to the configuration file (default: ~/.convokit/config.yml) + """ + + def __init__(self, path: Optional[str] = None): + if path is None: + path = os.path.expanduser("~/.convokit/config.yml") + self.path = Path(path) + self._data = {} + self._load() + + def _load(self): + """Load configuration data from the YAML file. + + Creates the configuration file and directory if they don't exist. + """ + if self.path.exists(): + self._data = yaml.safe_load(self.path.read_text()) or {} + else: + self.path.parent.mkdir(parents=True, exist_ok=True) + self._data = {} + self._save() + + def _save(self): + """Save configuration data to the YAML file.""" + self.path.write_text(yaml.safe_dump(self._data)) + + def set_api_key(self, provider: str, key: str): + """Set an API key for a specific provider. + + :param provider: Name of the LLM provider (e.g., "gpt", "gemini") + :param key: API key for the provider + """ + self._data.setdefault("api_keys", {})[provider] = key + self._save() + + def get_api_key(self, provider: str) -> Optional[str]: + """Get the API key for a specific provider. + + First checks environment variables, then falls back to the configuration file. + + :param provider: Name of the LLM provider (e.g., "gpt", "gemini") + :return: API key if found, None otherwise + """ + env = os.getenv(f"{provider.upper()}_API_KEY") + if env: + return env + return self._data.get("api_keys", {}).get(provider) + + def set_google_cloud_config(self, project: str, location: str): + """Set Google Cloud configuration for Vertex AI. + + :param project: Google Cloud project ID + :param location: Google Cloud location (e.g., "us-central1") + """ + self._data.setdefault("google_cloud", {})["project"] = project + self._data["google_cloud"]["location"] = location + self._save() + + def get_google_cloud_project(self) -> Optional[str]: + """Get the Google Cloud project ID. + + First checks environment variables, then falls back to the configuration file. + + :return: Google Cloud project ID if found, None otherwise + """ + env = os.getenv("GOOGLE_CLOUD_PROJECT") + if env: + return env + return self._data.get("google_cloud", {}).get("project") + + def get_google_cloud_location(self) -> Optional[str]: + """Get the Google Cloud location. + + First checks environment variables, then falls back to the configuration file. + + :return: Google Cloud location if found, None otherwise + """ + env = os.getenv("GOOGLE_CLOUD_LOCATION") + if env: + return env + return self._data.get("google_cloud", {}).get("location") diff --git a/convokit/genai/gpt_client.py b/convokit/genai/gpt_client.py new file mode 100644 index 00000000..91ec8a45 --- /dev/null +++ b/convokit/genai/gpt_client.py @@ -0,0 +1,85 @@ +from openai import OpenAI, OpenAIError, RateLimitError, Timeout +from .base import LLMClient, LLMResponse +from .genai_config import GenAIConfigManager +import time + + +class GPTClient(LLMClient): + """Client for interacting with OpenAI GPT models. + + Provides an interface to generate text using OpenAI's GPT models through their API. + Handles authentication, request formatting, and error retry logic. API key is managed + through the GenAI config system. 
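A minimal sketch of using this client directly (it assumes an OpenAI API key has already been stored under the "gpt" provider via the config manager; the prompt text and parameter values are illustrative):

```python
from convokit.genai.genai_config import GenAIConfigManager
from convokit.genai.gpt_client import GPTClient

config = GenAIConfigManager()  # assumes an OpenAI key is already stored under "gpt"
client = GPTClient(model="gpt-4o-mini", config_manager=config)

# Plain strings and chat-style message lists are both accepted by generate().
response = client.generate(
    [{"role": "user", "content": "Reply with a single word: hello."}],
    output_max_tokens=16,
    temperature=0.0,
)
print(response.text, response.tokens, response.latency)
```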
+ + :param model: Name of the GPT model to use (default: "gpt-4o-mini") + :param config_manager: GenAIConfigManager instance (optional, will create one if not provided) + """ + + def __init__(self, model: str = "gpt-4o-mini", config_manager: GenAIConfigManager = None): + if config_manager is None: + config_manager = GenAIConfigManager() + + self.config_manager = config_manager + + # Get API key from config + api_key = config_manager.get_api_key("gpt") + if not api_key: + raise ValueError( + "OpenAI API key is required. " + "Set it using config_manager.set_api_key('gpt', 'your-key') " + "or via GPT_API_KEY environment variable." + ) + + self.client = OpenAI(api_key=api_key) + self.model = model + + def generate( + self, prompt, output_max_tokens=512, temperature=0.0, times_retried=0 + ) -> LLMResponse: + """Generate text using the GPT model. + + Sends a prompt to the GPT model and returns the generated response. Handles + different input formats (string or message list) and includes retry logic for + API errors. + + :param prompt: Input prompt for generation. Can be a string or list of message dicts + :param output_max_tokens: Maximum number of tokens to generate (default: 512) + :param temperature: Sampling temperature for generation (default: 0.0) + :param times_retried: Number of retry attempts made so far (for internal use) + :return: LLMResponse object containing the generated text and metadata + :raises Exception: If output error and retry attempts are exhausted + """ + start = time.time() + retry_after = 10 + + # Check prompt type to determine how to format messages + if isinstance(prompt, str): + messages = [{"role": "user", "content": prompt}] + elif isinstance(prompt, list): + if all(isinstance(m, dict) and "role" in m and "content" in m for m in prompt): + messages = prompt + else: + raise ValueError( + "Invalid message format: each message must be a dict with 'role' and 'content'" + ) + else: + raise TypeError("Prompt must be either a string or a list of message dicts") + + try: + response = self.client.chat.completions.create( + model=self.model, + messages=messages, + max_tokens=output_max_tokens, + temperature=temperature, + ) + except (OpenAIError, RateLimitError, Timeout) as e: + if times_retried >= 3: + raise Exception("Retry failed after multiple attempts.") from e + print(f"{type(e).__name__}: {e}. Retrying in {retry_after}s...") + time.sleep(retry_after) + return self.generate(prompt, output_max_tokens, temperature, times_retried + 1) + + elapsed = time.time() - start + content = response.choices[0].message.content + tokens_used = response.usage.total_tokens if response.usage else -1 + return LLMResponse(text=content, tokens=tokens_used, latency=elapsed, raw=response) diff --git a/convokit/genai/llmprompttransformer.py b/convokit/genai/llmprompttransformer.py new file mode 100644 index 00000000..c27f4994 --- /dev/null +++ b/convokit/genai/llmprompttransformer.py @@ -0,0 +1,125 @@ +from typing import Optional, Union, Callable, Dict, Any +from convokit import Transformer, Corpus, Conversation, Speaker, Utterance +from .factory import get_llm_client +from .genai_config import GenAIConfigManager + + +class LLMPromptTransformer(Transformer): + """ + A ConvoKit Transformer that uses GenAI clients to process objects and store outputs as metadata. + + This transformer applies LLM prompts to different levels of the corpus (conversation, speaker, utterance, corpus) + using a formatter function to prepare the object data for the prompt, and stores the LLM responses as metadata. 
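As a sketch of how the prompt, formatter, and selector fit together at the conversation level (the prompt wording and metadata field name below are illustrative, and an OpenAI key is assumed to be configured already):

```python
from convokit import Corpus, download
from convokit.genai import LLMPromptTransformer
from convokit.genai.genai_config import GenAIConfigManager

config = GenAIConfigManager()  # assumes an OpenAI key is already stored under "gpt"
corpus = Corpus(filename=download("friends-corpus"))
target_convo_id = corpus.get_conversation_ids()[0]

summarizer = LLMPromptTransformer(
    provider="gpt",
    model="gpt-4o-mini",
    object_level="conversation",
    # The prompt must contain the '{formatted_object}' placeholder.
    prompt="Summarize the following conversation in one sentence:\n{formatted_object}",
    # The formatter turns each Conversation into the string that replaces the placeholder.
    formatter=lambda convo: "\n".join(
        f"{utt.speaker.id}: {utt.text}" for utt in convo.iter_utterances()
    ),
    metadata_name="llm_summary",
    # Restrict processing to a single conversation to keep API usage small.
    selector=lambda convo: convo.id == target_convo_id,
    config_manager=config,
)

corpus = summarizer.transform(corpus)
print(corpus.get_conversation(target_convo_id).meta["llm_summary"])
```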
+ + :param provider: LLM provider name ("gpt", "gemini", "local", etc.) + :param model: LLM model name + :param object_level: Object level at which to apply the transformer ("conversation", "speaker", "utterance", "corpus") + :param prompt: Template string for the prompt. Must contain '{formatted_object}' as a placeholder where the formatted object data will be inserted + :param formatter: Function that takes an object and returns a string representation that will replace the '{formatted_object}' placeholder in the prompt + :param metadata_name: Name of the metadata field to store the LLM response + :param selector: Optional function to filter which objects to process. Defaults to processing all objects + :param config_manager: GenAIConfigManager instance for LLM API key management + :param llm_kwargs: Additional keyword arguments to pass to the LLM client + """ + + def __init__( + self, + provider: str, + model: str, + object_level: str, + prompt: str, + formatter: Callable[[Union[Corpus, Conversation, Speaker, Utterance]], str], + metadata_name: str, + selector: Optional[ + Callable[[Union[Corpus, Conversation, Speaker, Utterance]], bool] + ] = None, + config_manager: Optional[GenAIConfigManager] = None, + llm_kwargs: Optional[Dict[str, Any]] = None, + ): + self.provider = provider + self.model = model + self.object_level = object_level + self.prompt = prompt + self.formatter = formatter + self.metadata_name = metadata_name + self.selector = selector or (lambda obj: True) + self.config_manager = config_manager or GenAIConfigManager() + self.llm_kwargs = llm_kwargs or {} + + if model is not None: + self.llm_kwargs["model"] = model + + if object_level not in ["conversation", "speaker", "utterance", "corpus"]: + raise ValueError( + f"Invalid object_level: {object_level}. Must be one of: conversation, speaker, utterance, corpus" + ) + + if "{formatted_object}" not in prompt: + raise ValueError( + "Prompt must contain '{formatted_object}' placeholder for the formatted object data" + ) + + self.llm_client = get_llm_client(provider, self.config_manager, **self.llm_kwargs) + + def _format_prompt(self, obj: Union[Corpus, Conversation, Speaker, Utterance]) -> str: + """ + Format the prompt with the object data using the formatter function. + + :param obj: Object to format + :return: Formatted prompt string + """ + try: + formatted_object = self.formatter(obj) + return self.prompt.format(formatted_object=formatted_object) + except Exception as e: + raise ValueError(f"Error formatting object for prompt: {e}") + + def _process_object(self, obj: Union[Corpus, Conversation, Speaker, Utterance]) -> None: + """ + Process a single object with the LLM and store the result in metadata. + + :param obj: Object to process + """ + try: + formatted_prompt = self._format_prompt(obj) + response = self.llm_client.generate(formatted_prompt) + obj.add_meta(self.metadata_name, response.text) + except Exception as e: + print(f"Error processing {self.object_level} {obj.id}: {e}") + obj.add_meta(self.metadata_name, None) + + def transform(self, corpus: Corpus) -> Corpus: + """ + Apply the GenAI transformer to the corpus. 
+ + :param corpus: The corpus to transform + :return: The transformed corpus with LLM responses added as metadata + """ + if self.object_level == "utterance": + for utterance in corpus.iter_utterances(): + if self.selector(utterance): + self._process_object(utterance) + else: + utterance.add_meta(self.metadata_name, None) + + elif self.object_level == "conversation": + for conversation in corpus.iter_conversations(): + if self.selector(conversation): + self._process_object(conversation) + else: + conversation.add_meta(self.metadata_name, None) + + elif self.object_level == "speaker": + for speaker in corpus.iter_speakers(): + if self.selector(speaker): + self._process_object(speaker) + else: + speaker.add_meta(self.metadata_name, None) + + elif self.object_level == "corpus": + if self.selector(corpus): + self._process_object(corpus) + else: + corpus.add_meta(self.metadata_name, None) + + return corpus diff --git a/convokit/genai/local_client.py b/convokit/genai/local_client.py new file mode 100644 index 00000000..acb2aba9 --- /dev/null +++ b/convokit/genai/local_client.py @@ -0,0 +1,38 @@ +from .base import LLMClient, LLMResponse +from .genai_config import GenAIConfigManager +import time + + +class LocalClient(LLMClient): + """Template client for local LLM models. This is not a implemented client. + + This is a template implementation for local LLM clients. It provides a mock + implementation that should be replaced with actual local model loading and inference. + Currently returns mock responses for testing purposes. + + :param model_path: Path to the local model files (e.g., llama.cpp or GGUF model) + :param config_manager: GenAIConfigManager instance (optional, will create one if not provided) + """ + + def __init__(self, model_path: str = "./", config_manager: GenAIConfigManager = None): + if config_manager is None: + config_manager = GenAIConfigManager() + + self.config_manager = config_manager + self.model_path = model_path # e.g., load a llama.cpp or GGUF-backed model + + def generate(self, messages, **kwargs) -> LLMResponse: + """Generate text using the local model. + + Currently returns a mock response. This method should be implemented to + actually load and run the local model for text generation. + + :param messages: Input messages for generation + :param **kwargs: Additional generation parameters + :return: LLMResponse object containing the generated text and metadata + """ + start = time.time() + prompt = " ".join(m["content"] for m in messages) + response = f"[Mock local model output for: {prompt}]" + latency = time.time() - start + return LLMResponse(text=response, tokens=-1, latency=latency, raw={"prompt": prompt}) diff --git a/docs/source/analysis.rst b/docs/source/analysis.rst index 389fae45..9caca91d 100644 --- a/docs/source/analysis.rst +++ b/docs/source/analysis.rst @@ -17,6 +17,8 @@ These are the transformers related to generating some analysis of the Corpus. PairedPrediction Ranker SpeakerConvoDiversity + SCD + ConvoDynamicsSimilarity Redirection UtteranceLikelihood TalkTimeSharingDynamics \ No newline at end of file diff --git a/docs/source/condyns.rst b/docs/source/condyns.rst new file mode 100644 index 00000000..7dd8f62f --- /dev/null +++ b/docs/source/condyns.rst @@ -0,0 +1,47 @@ +Conversation Dynamics Similarity (ConDynS) +========================================== + +ConDynS is a similarity measure for comparing conversations with respect to their dynamics, as introduced in the paper `"A Similarity Measure for Comparing Conversational Dynamics" `_. 
The quality of a conversation goes beyond the individual quality of each reply, and instead emerges from how these combine into interactional patterns that give the conversation its distinctive overall "shape". ConDynS provides a robust automated method for comparing conversations in terms of their overall interactional dynamics. + +In this module, we provide a comprehensive framework for computing ConDynS, including: + +* **ConDynS**: Main similarity computation using bidirectional comparison between SCD patterns and conversation transcripts +* **NaiveConDynS**: Simplified similarity computation using only SoP comparison without transcripts +* **ConDynSBaselines**: Baseline methods for comparison including BERTScore, cosine similarity, and LLM-based direct comparison + +ConDynS builds on top of the `SCD (Summary of Conversation Dynamics) `_ module, which generates structured summaries of conversation dynamics. To compute ConDynS, you first need to extract SCD summaries from your conversations using the SCD transformer. + +Note that ConDynS computation requires access to a LLM. We provide a unified interface for working with LLMs in the `GenAI module `_. It is recommended to setup for GenAI models in the module beforehand to compute ConDynS. + +Usage Examples +-------------- + +We provide experiments notebooks from the paper: + +* `Validation experiments `_ +* `Baseline comparisons `_ +* `Applications to online communities `_ +* Applications on `WikiConv German `_ and `Friends `_ + +To see a simple example of using both SCD and ConDynS together, check out `this example notebook `_. + +Modules +------- + +ConDynS +^^^^^^^^ + +.. automodule:: convokit.convo_similarity.condyns + :members: + +NaiveConDynS +^^^^^^^^^^^^ + +.. automodule:: convokit.convo_similarity.naive_condyns + :members: + +Baseline Methods +^^^^^^^^^^^^^^^^ + +.. automodule:: convokit.convo_similarity.baseline + :members: \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 93a7d0d0..cc29b412 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -64,9 +64,9 @@ # built documents. # # The short X.Y version. -version = "3.5" +version = "3.6" # The full version, including alpha/beta/rc tags. -release = "3.5.0" +release = "3.6.0" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -376,4 +376,12 @@ "bson", "dnspython", "datasets", + "transformers", + "unsloth", + "sentence_transformers", + "evaluate", + "openai", + "google", + "google.genai", + "google.genai.types", ] diff --git a/docs/source/genai.rst b/docs/source/genai.rst new file mode 100644 index 00000000..0694a1b6 --- /dev/null +++ b/docs/source/genai.rst @@ -0,0 +1,109 @@ +GenAI +====== + +The GenAI module provides a unified interface for working with LLMs while doing conversational analysis in ConvoKit. The current implementation supports multiple providers including OpenAI GPT and Google Gemini, but is designed to be extensible to LLMs from other model providers and local models. This module makes it easy to integrate AI-powered text generation into your ConvoKit workflows for diverse tasks. The module handles API key management, response formatting, and provides consistent interfaces across different LLM providers. 
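Before the component-by-component reference below, here is a minimal end-to-end sketch (the API key and prompt are placeholders, and an OpenAI account is assumed):

```python
from convokit.genai import get_llm_client
from convokit.genai.genai_config import GenAIConfigManager

config = GenAIConfigManager()
config.set_api_key("gpt", "your-openai-api-key")  # placeholder key

client = get_llm_client("gpt", config, model="gpt-4o-mini")
response = client.generate("Give a one-sentence definition of conversational dynamics.")

print(response.text)     # generated text
print(response.tokens)   # token usage, or -1 when the provider does not report it
print(response.latency)  # wall-clock generation time in seconds
```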
+ +The module includes a ConvoKit transformer that allow you to apply LLM processing directly to corpus objects at different levels (utterances, conversations, speakers, or entire corpus), making it seamless to integrate AI analysis into your conversational data processing pipelines. + +Example usage: `GenAI module demo `_. + +Overview +-------- + +The GenAI module consists of several key components: + +* **LLMClient**: Abstract base class that defines the interface for all LLM clients +* **LLMResponse**: Unified response wrapper that standardizes output from different LLM providers +* **Factory Pattern**: Simple factory function to create appropriate client instances +* **Configuration Management**: Centralized API key and configuration management +* **Provider Clients**: Concrete implementations for different LLM providers (GPT, Gemini, Local) +* **GenAI Transformers**: ConvoKit transformers that apply LLM processing to corpus objects + +Basic Interface and Configuration +--------------------------------- + +.. automodule:: convokit.genai.base + :members: + +.. automodule:: convokit.genai.genai_config + :members: + +.. automodule:: convokit.genai.factory + :members: + +LLMPromptTransformer +^^^^^^^^^^^^^^^^^^^^ + +The LLMPromptTransformer is a flexible transformer that allows you to apply custom prompts and formatters to any level of corpus objects (utterances, conversations, speakers, or the entire corpus). It provides fine-grained control over how objects are formatted for LLM processing and where the results are stored. + +.. automodule:: convokit.genai.llmprompttransformer + :members: + +Provider Clients +---------------- + +Supported Providers +^^^^^^^^^^^^^^^^^^^ + +Currently supported LLM providers: + +* **OpenAI GPT**: Access to OpenAI GPT models through the OpenAI API. See `OpenAI API setup `_. +* **Google Gemini**: Access to Google Gemini models via Vertex AI. See `Vertex AI setup guide `_. +* **Local Models**: Template implementation for local LLM models (requires custom implementation) + +GPT Client +^^^^^^^^^^ + +.. automodule:: convokit.genai.gpt_client + :members: + +Gemini Client +^^^^^^^^^^^^^ + +.. automodule:: convokit.genai.gemini_client + :members: + +Local Client +^^^^^^^^^^^^ + +The LocalClient provides a template implementation for integrating local LLM models. The current implementation returns mock responses and serves as a starting point for implementing actual local model support. + +.. automodule:: convokit.genai.local_client + :members: + +Adding New Providers +^^^^^^^^^^^^^^^^^^^^ + +To add support for a new LLM provider: + +1. Create a new client class that inherits from `LLMClient` +2. Update the configuration manager to support the new provider +3. Implement the required `generate()` method and optionally `stream()` method if applicable +4. Add the provider to the factory function in `factory.py` + +Configuration +------------- + +The GenAIConfigManager handles API key storage and retrieval for different LLM providers. It supports: + +* **File-based storage**: Configuration is stored in `~/.convokit/config.yml` +* **Environment variables**: API keys can be set via environment variables (e.g., `GPT_API_KEY`) +* **Secure storage**: API keys are stored locally and not exposed in code +* **Provider-specific settings**: Support for different configuration requirements per provider (e.g., Google Cloud project settings for Gemini) + +**Basic Usage:** + +.. 
code-block:: python + + from convokit.genai.genai_config import GenAIConfigManager + + config = GenAIConfigManager() + + # Set OpenAI API key + config.set_api_key("gpt", "your-openai-api-key") + + # Set Google Cloud configuration for Gemini + config.set_google_cloud_config("your-project-id", "your-location") + + # Configuration is automatically saved and can be reused + diff --git a/docs/source/index.rst b/docs/source/index.rst index 81ff71b3..1366d9e8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -9,7 +9,7 @@ Cornell Conversational Analysis Toolkit (ConvoKit) Documentation This toolkit contains tools to extract conversational features and analyze social phenomena in conversations, using a `single unified interface `_ inspired by (and compatible with) scikit-learn. Several large `conversational datasets `_ are included together with scripts exemplifying the use of the toolkit on these datasets. -More information can be found at our `website `_. The latest version is `3.5.0 `_ (released Oct. 15, 2025). +More information can be found at our `website `_. The latest version is `3.6.0 `_ (released Oct. 25, 2025). Contents -------- diff --git a/docs/source/scd.rst b/docs/source/scd.rst new file mode 100644 index 00000000..992cd31d --- /dev/null +++ b/docs/source/scd.rst @@ -0,0 +1,25 @@ +Summary of Conversation Dynamics (SCD) +======================================== + +SCD (Summary of Conversation Dynamics) is a ConvoKit Transformer that generates summaries of conversational dynamics from conversation transcripts, as introduced in the paper `"How did we get here? Summarizing conversation dynamics" `_. + +SCD extracts structured representations of conversation dynamics in two forms: + +* **Summary of Conversation Dynamics (SCD)**: A summary describing the overall dynamics in a conversation +* **Sequence of Patterns (SoP)**: A structured sequence of interaction patterns extracted from the SCD, introduced in the paper `"A Similarity Measure for Comparing Conversational Dynamics" `_ + +Note that SCD computation requires access to a LLM. We provide a unified interface for working with LLMs in the `GenAI module `_. It is recommended to setup for GenAI models in the module beforehand to compute SCD. + +Usage Examples +-------------- + +To see the use of SCD Transformer in action, check out: + +* `Simple example notebook `_ showcasing basic SCD usage. + +Module Reference +---------------- + +.. 
automodule:: convokit.convo_similarity.scd + :members: + diff --git a/docs/source/utilities.rst b/docs/source/utilities.rst index 2d139183..687a87a4 100644 --- a/docs/source/utilities.rst +++ b/docs/source/utilities.rst @@ -9,3 +9,4 @@ Miscellaneous utility functions for managing datasets Util Speaker Conversation Utilities Pipeline + GenAI diff --git a/setup.py b/setup.py index e5ba5ebf..796b4168 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ author_email="cristian@cs.cornell.edu", url="https://github.com/CornellNLP/ConvoKit", description="ConvoKit", - version="3.5.0", + version="3.6.0", packages=[ "convokit", "convokit.bag_of_words", @@ -36,12 +36,15 @@ "convokit.surprise", "convokit.pivotal_framework", "convokit.utterance_simulator", + "convokit.genai", + "convokit.convo_similarity", ], package_data={ "convokit": [ "data/*.txt", "politeness_collections/politeness_local/lexicons/*.json", "politeness_collections/politeness_cscw_zh/lexicons/*.json", + "convo_similarity/prompts/*.txt", ] }, install_requires=[ @@ -66,11 +69,25 @@ "numexpr>=2.8.0", "ruff>=0.4.8", "bottleneck", + "accelerate", + "peft", + "bitsandbytes", + "transformers", + "unsloth", + "trl>=0.12.2", + "tensorflow>=2.18.0", + "tf-keras>=2.17.0,<3.0.0", + "evaluate", + "sentence-transformers", "datasets", ], extras_require={ "craft": ["torch>=0.12"], "forecaster": ["torch>=0.12", "datasets"], + "genai": [ + "openai>=1.3.5", + "google-genai", + ], "llm": [ "torch>=0.12", "accelerate",