From aff3b5f4dde7064486867fa80705046bcec983f0 Mon Sep 17 00:00:00 2001 From: Jamie Ip Date: Wed, 11 Nov 2020 18:52:02 -0800 Subject: [PATCH 1/8] edited my created tests --- consensus_and_scoring/test/test_IAA_jamie.py | 30 +++++++++++++------- consensus_and_scoring/test/test_dep_jamie.py | 2 +- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/consensus_and_scoring/test/test_IAA_jamie.py b/consensus_and_scoring/test/test_IAA_jamie.py index a452417..5f21bc7 100644 --- a/consensus_and_scoring/test/test_IAA_jamie.py +++ b/consensus_and_scoring/test/test_IAA_jamie.py @@ -39,25 +39,33 @@ def test_user_highlighting_consensus(config, tmpdir): #N users on schema v1 and N users on schema v2--ensure output rows identical def test_diff_schemas(config, tmpdir): test_path = test_utils.make_test_directory(config, 'test_diff_schemas') - out_path = test_utils.make_test_directory(config, 'out_test_diff_schemas') + out_path = test_utils.make_test_directory(config, 'test_diff_schemas_out') #Covid_Evidence2020_03_21_copy is a copy with Q13 set to Ordinal, which should be detected as a new schema for x in [('jamietest_old', 'Covid_Evidence2020_03_21'), ('jamietest_new', 'Covid_Evidence2020_03_21_copy')]: dh = datahunt(out_folder=test_path, source_task_id = x[0]) dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': x[1], 'contributor_uuid':'A'}) - dh.add_row({'answer_label': 'T1.Q1.A3', 'namespace': x[1], 'contributor_uuid':'B'}) - dh.add_row({'answer_label': 'T1.Q3.A1', 'namespace': x[1], 'contributor_uuid':'C'}) - dh.add_row({'answer_label': 'T1.Q14.A1', 'namespace': x[1], 'contributor_uuid':'D'}) - dh.add_row({'answer_label': 'T1.Q14.A10', 'namespace': x[1], 'contributor_uuid':'E'}) - dh.add_row({'answer_label': 'T1.Q14.A10', 'namespace': x[1], 'contributor_uuid':'F'}) + dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': x[1], 'contributor_uuid':'A'}) + dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': x[1], 'contributor_uuid':'B'}) + dh.add_row({'answer_label': 'T1.Q1.A3', 'namespace': x[1], 'contributor_uuid':'A'}) + dh.add_row({'answer_label': 'T1.Q2.A1', 'namespace': x[1], 'contributor_uuid':'A'}) + dh.add_row({'answer_label': 'T1.Q2.A2', 'namespace': x[1], 'contributor_uuid':'A'}) + dh.add_row({'answer_label': 'T1.Q2.A8', 'namespace': x[1], 'contributor_uuid':'A'}) + dh.add_row({'answer_label': 'T1.Q2.A7', 'namespace': x[1], 'contributor_uuid':'B'}) + dh.add_row({'answer_label': 'T1.Q2.A8', 'namespace': x[1], 'contributor_uuid':'B'}) + dh.add_row({'answer_label': 'T1.Q3.A1', 'namespace': x[1], 'contributor_uuid':'A'}) fin_path = dh.export() data_path = config['data_dir'] schema_path = config['persistent_test_dir']+'/schemas' iaa_out = calc_agreement_directory(test_path, schema_path, config['IAA_config_dir'], test_utils.texts_dir, outDirectory = out_path) for root, dir, files in os.walk(iaa_out): - out_df_old = pd.read_csv(os.path.join(iaa_out, files[0]), encoding='utf-8') - out_df_new = pd.read_csv(os.path.join(iaa_out, files[1]), encoding='utf-8') - out_df_new = out_df_new.drop(['schema_sha256', 'namespace'], axis=1) - out_df_old = out_df_old.drop(['schema_sha256', 'namespace'], axis=1) + out_df_old = pd.read_csv(os.path.join(iaa_out, 'DataHunt_jamietest_old.IAA-Default-Tags.csv'), encoding='utf-8') + out_df_new = pd.read_csv(os.path.join(iaa_out, 'DataHunt_jamietest_new.IAA-Default-Tags.csv'), encoding='utf-8') - assert out_df_old.equals(out_df_new) + assert out_df_old.equals(out_df_new) == False + + schema_columns = ['article_sha256', 'article_id', 'schema_sha256', 'namespace'] + 
out_df_old = out_df_old.drop(schema_columns, axis=1) + out_df_new = out_df_new.drop(schema_columns, axis=1) + + assert out_df_old.equals(out_df_new) == True diff --git a/consensus_and_scoring/test/test_dep_jamie.py b/consensus_and_scoring/test/test_dep_jamie.py index 16852e4..c4f6402 100644 --- a/consensus_and_scoring/test/test_dep_jamie.py +++ b/consensus_and_scoring/test/test_dep_jamie.py @@ -60,7 +60,7 @@ def test_bad_parent(config): iaa_files_path = test_utils.make_test_directory(config, 'dep_bad_dad') out_path = test_utils.make_test_directory(config, 'dep_bad_dad_out') - parents = {1:[2], 2:[3,4,5,7,8], 5:[6], 9:[10,11]} + parents = {1:[2], 2:[3,4,5], 5:[6], 9:[10,11]} childNumAnswers = {2:9, 3:1, 4:6, 5:5, 6:3, 7:1, 8:5, 10:5, 11:5} for parent in parents: iaa = IAA_task(out_folder=iaa_files_path, source_task_id='gru' + str(parent)) From 8cc3eef3086625a111d399d41e2402e1b5f69e7e Mon Sep 17 00:00:00 2001 From: Jamie Ip Date: Wed, 11 Nov 2020 21:58:41 -0800 Subject: [PATCH 2/8] laid groundwork for agreemnt scores --- consensus_and_scoring/AgreementScoring.py | 12 ++++++ consensus_and_scoring/Dependency.py | 3 ++ .../test/test_agreement_score.py | 37 +++++++++++++++++++ 3 files changed, 52 insertions(+) create mode 100644 consensus_and_scoring/AgreementScoring.py create mode 100644 consensus_and_scoring/test/test_agreement_score.py diff --git a/consensus_and_scoring/AgreementScoring.py b/consensus_and_scoring/AgreementScoring.py new file mode 100644 index 0000000..59d5e3d --- /dev/null +++ b/consensus_and_scoring/AgreementScoring.py @@ -0,0 +1,12 @@ +import pandas as pd +import numpy as np + +def AgreementScore(iaaData, schemaPath): + print("AGREEMENT SCORING TIME!!!") + print("OLD AGREEMENT SCORES:") + print(iaaData['agreement_score']) + #TODO: AGREEMENT SCORE CHANGES HERE + #iaaData['agreement_score'] = np.zeros(3) + print("NEW AGREEMENT SCORES:") + print(iaaData['agreement_score']) + return iaaData diff --git a/consensus_and_scoring/Dependency.py b/consensus_and_scoring/Dependency.py index 4d82197..87e6524 100644 --- a/consensus_and_scoring/Dependency.py +++ b/consensus_and_scoring/Dependency.py @@ -4,6 +4,7 @@ import os import json from dataV3 import * +from AgreementScoring import * def eval_dependency(directory, iaa_dir, schema_dir, out_dir): print("DEPENDENCY STARTING") @@ -132,6 +133,8 @@ def handleDependencies(schemaPath, iaaPath, out_dir): indices = merge_indices(row_indices, indices).tolist() iaaData.at[row, 'highlighted_indices'] = json.dumps(indices) + iaaData = AgreementScore(iaaData, schemaPath) + print('exporting to csv') path, name = get_path(iaaPath) outputpath = os.path.join(out_dir, 'Dep_'+name) diff --git a/consensus_and_scoring/test/test_agreement_score.py b/consensus_and_scoring/test/test_agreement_score.py new file mode 100644 index 0000000..c61f6e5 --- /dev/null +++ b/consensus_and_scoring/test/test_agreement_score.py @@ -0,0 +1,37 @@ +import sys +import os +import pandas as pd + +import test_utils +from filegen_utils import * +from Dependency import * +import conftest + +#REFERENCE: in Evidence, parents = {1.1:[2], 1.2:[2], 2.1:[4], 2.5:[4,5], 2.8:[3], 5.1:[6], 5.2:[6], 5.3:[6], 9.1:[10,11], 9.2:[10,11]} +def test_dep_sample(config): + iaa_files_path = test_utils.make_test_directory(config, 'dep_sample') + out_path = test_utils.make_test_directory(config, 'out_dep_sample') + # source_task_id generated by smashing keyboard + iaa = IAA_task(out_folder=iaa_files_path, source_task_id='kjncsa87nxao21899102j1j2') + iaa.add_row({"namespace": "Covid_Evidence2020_03_21", 
"question_Number": 1, "agreed_Answer": 1, "agreement_score": .75}) + iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 1, "agreed_Answer": 1, "agreement_score": .75}) + iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 1, "agreed_Answer": 1, "agreement_score": .75}) + fin_path = iaa.export() + data_path = config['data_dir'] + schema_path = data_path + '/schemas' + dh_path = None #doesn't get used by dependency but is still an argument + + eval_dependency(dh_path, iaa_files_path, schema_path, out_path) + + for root, dir, files in os.walk(out_path): + for file in files: + #should be only 1 file for this case, so just run it on the only one + # if there's more than 1 then you can get fancy + out_df = pd.read_csv(os.path.join(out_path, file), encoding='utf-8') + #9 answer choices to a checklist question + # assert len(out_df) == 2 + # q_three = out_df[out_df['question_Number']==2] + # hl = q_three['highlighted_indices'].iloc[0] + # assert len(hl) >18 + # assert '10' in hl + # assert '29' in hl From 3e4a43d8be7f46c35b8771d2acc1a6bb1e3e04ea Mon Sep 17 00:00:00 2001 From: Jamie Ip Date: Thu, 12 Nov 2020 13:54:34 -0800 Subject: [PATCH 3/8] implemented agremeent scoring by parent weights --- consensus_and_scoring/AgreementScoring.py | 42 +++++++++++++++++-- .../test/test_agreement_score.py | 34 ++++++++++++--- 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/consensus_and_scoring/AgreementScoring.py b/consensus_and_scoring/AgreementScoring.py index 59d5e3d..55fa407 100644 --- a/consensus_and_scoring/AgreementScoring.py +++ b/consensus_and_scoring/AgreementScoring.py @@ -1,12 +1,48 @@ import pandas as pd import numpy as np +import re +from dataV3 import create_dependencies_dict def AgreementScore(iaaData, schemaPath): print("AGREEMENT SCORING TIME!!!") print("OLD AGREEMENT SCORES:") - print(iaaData['agreement_score']) + print(iaaData[['question_Number', 'agreed_Answer', 'agreement_score']]) #TODO: AGREEMENT SCORE CHANGES HERE - #iaaData['agreement_score'] = np.zeros(3) + schemData = pd.read_csv(schemaPath, encoding = 'utf-8') + dependencies = create_dependencies_dict(schemData) + iaaQuestions = iaaData['question_Number'].tolist() + for child in dependencies.keys(): + if child not in iaaQuestions: + continue + parents = dependencies[child].keys() + #TODO: clean this up + temp = [] + for parent in parents: + answers = dependencies[child][parent] + parentScores = iaaData[(iaaData['question_Number'] == parent)] + parentScores = parentScores[parentScores['agreed_Answer'].astype(int).isin(answers)] + temp.append(np.mean(parentScores['agreement_score'])) + avgParentScores = np.mean(temp) + iaaData['agreement_score'] = np.where(iaaData['question_Number'] == child, iaaData['agreement_score'] * avgParentScores, iaaData['agreement_score']) + #iaaData['agreement_score'] = np.zeros(3) print("NEW AGREEMENT SCORES:") - print(iaaData['agreement_score']) + print(iaaData[['question_Number', 'agreed_Answer', 'agreement_score']]) return iaaData + +# Creates a dictionary of Parent Question: Answer: Child Questions +# ex. {1: {1: [2], 2: [2]}, 2: {1: [4], 5: [4, 5], 8: [3]}, 5: {1: [6], 2: [6], 3: [6]}, 9: {1: [10, 11], 2: [10, 11]}} +# T1.Q1.A1 changes T1.Q2, etc. 
+# def create_parents_dict(schemadata): +# df = schemadata[schemadata['answer_next_questions'].notna()] +# parents = df['answer_label'].tolist() +# children = df['answer_next_questions'].tolist() +# dependencies = {} +# for i in range(len(parents)): +# parent_q = int(re.findall(r"Q(\d+)", parents[i])[0]) +# parent_a = int(re.findall(r"A(\d+)", parents[i])[0]) +# child_q = [int(q) for q in re.findall(r"Q(\d+)", children[i])] +# if parent_q not in dependencies: +# dependencies[parent_q] = {parent_a:child_q} +# else: +# dependencies[parent_q][parent_a] = child_q +# return dependencies diff --git a/consensus_and_scoring/test/test_agreement_score.py b/consensus_and_scoring/test/test_agreement_score.py index c61f6e5..0eb945e 100644 --- a/consensus_and_scoring/test/test_agreement_score.py +++ b/consensus_and_scoring/test/test_agreement_score.py @@ -9,13 +9,13 @@ #REFERENCE: in Evidence, parents = {1.1:[2], 1.2:[2], 2.1:[4], 2.5:[4,5], 2.8:[3], 5.1:[6], 5.2:[6], 5.3:[6], 9.1:[10,11], 9.2:[10,11]} def test_dep_sample(config): - iaa_files_path = test_utils.make_test_directory(config, 'dep_sample') - out_path = test_utils.make_test_directory(config, 'out_dep_sample') + iaa_files_path = test_utils.make_test_directory(config, 'agscore_sample') + out_path = test_utils.make_test_directory(config, 'agscore_sample_out') # source_task_id generated by smashing keyboard - iaa = IAA_task(out_folder=iaa_files_path, source_task_id='kjncsa87nxao21899102j1j2') - iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 1, "agreed_Answer": 1, "agreement_score": .75}) - iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 1, "agreed_Answer": 1, "agreement_score": .75}) - iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 1, "agreed_Answer": 1, "agreement_score": .75}) + iaa = IAA_task(out_folder=iaa_files_path, source_task_id='agscore_test') + iaa.add_row({"namespace":"Covid_Evidence2020_03_21", "question_Number":1, "agreed_Answer":1, "agreement_score":.5, 'highlighted_indices':test_utils.make_highlight_indices(10,30)}) + iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 2, "agreed_Answer": 1, "agreement_score": .5}) + iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 4, "agreed_Answer": 1, "agreement_score": .5}) fin_path = iaa.export() data_path = config['data_dir'] schema_path = data_path + '/schemas' @@ -35,3 +35,25 @@ def test_dep_sample(config): # assert len(hl) >18 # assert '10' in hl # assert '29' in hl + +def test_dep_sample2(config): + iaa_files_path = test_utils.make_test_directory(config, 'agscore_sample2') + out_path = test_utils.make_test_directory(config, 'agscore_sample2_out') + # source_task_id generated by smashing keyboard + iaa = IAA_task(out_folder=iaa_files_path, source_task_id='agscore_test') + iaa.add_row({"namespace":"Covid_Evidence2020_03_21", "question_Number":1, "agreed_Answer":1, "agreement_score":.5}) + iaa.add_row({"namespace":"Covid_Evidence2020_03_21", "question_Number":1, "agreed_Answer":2, "agreement_score":1}) + iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 2, "agreed_Answer": 1, "agreement_score": 1}) + iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 2, "agreed_Answer": 2, "agreement_score": .5}) + fin_path = iaa.export() + data_path = config['data_dir'] + schema_path = data_path + '/schemas' + dh_path = None #doesn't get used by dependency but is still an argument + + eval_dependency(dh_path, iaa_files_path, schema_path, 
out_path) + + for root, dir, files in os.walk(out_path): + for file in files: + #should be only 1 file for this case, so just run it on the only one + # if there's more than 1 then you can get fancy + out_df = pd.read_csv(os.path.join(out_path, file), encoding='utf-8') From ceea4f342b185f5219c488d2a5ee867bc82b86f2 Mon Sep 17 00:00:00 2001 From: Jamie Ip Date: Thu, 19 Nov 2020 00:22:47 -0800 Subject: [PATCH 4/8] working on highlighting agreement scoring --- consensus_and_scoring/AgreementScoring.py | 44 +++++++++++++------- consensus_and_scoring/ChecklistCoding.py | 6 ++- consensus_and_scoring/test/test_IAA_basic.py | 37 ++++++++++++++++ 3 files changed, 72 insertions(+), 15 deletions(-) create mode 100644 consensus_and_scoring/test/test_IAA_basic.py diff --git a/consensus_and_scoring/AgreementScoring.py b/consensus_and_scoring/AgreementScoring.py index 55fa407..afbb864 100644 --- a/consensus_and_scoring/AgreementScoring.py +++ b/consensus_and_scoring/AgreementScoring.py @@ -2,6 +2,21 @@ import numpy as np import re from dataV3 import create_dependencies_dict +from nltk import agreement + +def highlightAgreementScore(starts, ends): + print("HIGHLIGHT AGREEMENT SCORING TIME!!!") + return 666 + +coder1 = [1,0,2,0,1,1,2,0,1,1] +coder2 = [1,1,0,0,1,1,2,1,1,0] +coder3 = [1,2,2,1,2,1,2,1,1,0] +formatted_codes = [[1,i,coder1[i]] for i in range(len(coder1))] + [[2,i,coder2[i]] for i in range(len(coder2))] + [[3,i,coder3[i]] for i in range(len(coder3))] +ratingtask = agreement.AnnotationTask(data=formatted_codes) + +print('Fleiss\'s Kappa:',ratingtask.multi_kappa()) +print('Krippendorff\'s alpha:',ratingtask.alpha()) +print('Scott\'s pi:',ratingtask.pi()) def AgreementScore(iaaData, schemaPath): print("AGREEMENT SCORING TIME!!!") @@ -32,17 +47,18 @@ def AgreementScore(iaaData, schemaPath): # Creates a dictionary of Parent Question: Answer: Child Questions # ex. {1: {1: [2], 2: [2]}, 2: {1: [4], 5: [4, 5], 8: [3]}, 5: {1: [6], 2: [6], 3: [6]}, 9: {1: [10, 11], 2: [10, 11]}} # T1.Q1.A1 changes T1.Q2, etc. 
-# def create_parents_dict(schemadata): -# df = schemadata[schemadata['answer_next_questions'].notna()] -# parents = df['answer_label'].tolist() -# children = df['answer_next_questions'].tolist() -# dependencies = {} -# for i in range(len(parents)): -# parent_q = int(re.findall(r"Q(\d+)", parents[i])[0]) -# parent_a = int(re.findall(r"A(\d+)", parents[i])[0]) -# child_q = [int(q) for q in re.findall(r"Q(\d+)", children[i])] -# if parent_q not in dependencies: -# dependencies[parent_q] = {parent_a:child_q} -# else: -# dependencies[parent_q][parent_a] = child_q -# return dependencies +# I wrote this function and it works but didn't actually end up using it since create_dependencies_dict was better +def create_parents_dict(schemadata): + df = schemadata[schemadata['answer_next_questions'].notna()] + parents = df['answer_label'].tolist() + children = df['answer_next_questions'].tolist() + dict = {} + for i in range(len(parents)): + parent_q = int(re.findall(r"Q(\d+)", parents[i])[0]) + parent_a = int(re.findall(r"A(\d+)", parents[i])[0]) + child_q = [int(q) for q in re.findall(r"Q(\d+)", children[i])] + if parent_q not in dict: + dict[parent_q] = {parent_a:child_q} + else: + dict[parent_q][parent_a] = child_q + return dict diff --git a/consensus_and_scoring/ChecklistCoding.py b/consensus_and_scoring/ChecklistCoding.py index bf69c65..8c6e7a1 100644 --- a/consensus_and_scoring/ChecklistCoding.py +++ b/consensus_and_scoring/ChecklistCoding.py @@ -1,4 +1,5 @@ from CodingScoring import * +from AgreementScoring import highlightAgreementScore #from repScores import * def scoreChecklist(answers,numUsers, num_choices): @@ -50,8 +51,11 @@ def evaluateChecklist(answers, users, starts, ends, numUsers, length, repDF,sour weightScaledNumUsers, userWeightDict, sourceText, useRep=useRep, threshold_func = threshold_func) firstSecondDiff = 1 - codingScore + + print("STARTS:",starts,"ENDS:",ends) + hlAgreeFactor = highlightAgreementScore(starts, ends) + #out.append(hlAgreeFactor) out.append([winner,units,uScore,iScore, codingScore, numUsers, selectedText, firstSecondDiff, 'checklist', num_choices]) #do_rep_calculation_nominal(users, answers, out[0], units, starts, ends, length, repDF,last30, checkListScale=(1/num_choices)) return out - diff --git a/consensus_and_scoring/test/test_IAA_basic.py b/consensus_and_scoring/test/test_IAA_basic.py new file mode 100644 index 0000000..da3eb8a --- /dev/null +++ b/consensus_and_scoring/test/test_IAA_basic.py @@ -0,0 +1,37 @@ +import sys +import os +import pandas as pd + +import test_utils +from filegen_utils import * +from IAA import * + +sys.path.append('../../') + +def test_iaa_constructor(config, tmpdir): + test_path = test_utils.make_test_directory(config, 'test_basic_b') + out_path = test_utils.make_test_directory(config, 'test_basic_b_out') + #source_task_id generated by smashing keyboard + dh = datahunt(out_folder=test_path, source_task_id = 'oogabooga') + + # dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'A', 'start_pos':1, 'end_pos':4}) + # dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'B', 'start_pos':2, 'end_pos':4}) + # dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'C', 'start_pos':1, 'end_pos':4}) + # dh.add_row({'answer_label': 'T1.Q2.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 'start_pos':1, 'end_pos':4}) + + dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 
'Covid_Evidence2020_03_21', 'contributor_uuid':'A', 'highlight_count':3, 'start_pos':1, 'end_pos':4}) + dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'B', 'highlight_count':3, 'start_pos':1, 'end_pos':4}) + dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'C', 'highlight_count':3, 'start_pos':1, 'end_pos':4}) + dh.add_row({'answer_label': 'T1.Q2.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 'highlight_count':3, 'start_pos':1, 'end_pos':4}) + + fin_path = dh.export() + data_path = config['data_dir'] + schema_path = data_path+'/schemas' + + iaa_out = calc_agreement_directory(test_path, schema_path, config['IAA_config_dir'], test_utils.texts_dir, outDirectory = out_path) + print(iaa_out) + for root, dir, files in os.walk(iaa_out): + for file in files: + #should be only 1 file for this case, so just run it on the only one + # if there's more than 1 then you can get fancy + out_df = pd.read_csv(os.path.join(iaa_out, file), encoding='utf-8') From 263847ef6500e02f594d7a6cf5aa00fbf6eeac68 Mon Sep 17 00:00:00 2001 From: Jamie Ip Date: Thu, 19 Nov 2020 16:09:38 -0800 Subject: [PATCH 5/8] finished highlight agreement scores for checklist questions --- consensus_and_scoring/AgreementScoring.py | 88 ++++++++++++++++--- consensus_and_scoring/ChecklistCoding.py | 27 ++++-- consensus_and_scoring/Dependency.py | 2 +- consensus_and_scoring/test/test_IAA_basic.py | 10 ++- .../test/test_agreement_score.py | 56 ++++-------- consensus_and_scoring/test/test_dep_basic.py | 52 +++++++++++ 6 files changed, 174 insertions(+), 61 deletions(-) create mode 100644 consensus_and_scoring/test/test_dep_basic.py diff --git a/consensus_and_scoring/AgreementScoring.py b/consensus_and_scoring/AgreementScoring.py index afbb864..73a595a 100644 --- a/consensus_and_scoring/AgreementScoring.py +++ b/consensus_and_scoring/AgreementScoring.py @@ -4,22 +4,47 @@ from dataV3 import create_dependencies_dict from nltk import agreement +#Takes in starts and ends of highlights for a specific question answer, returns factor to scale answer's agreement score by def highlightAgreementScore(starts, ends): + assert len(starts) == len(ends) + if len(starts) == 0: + return 0 + if len(starts) == 1: + return 1 + print("HIGHLIGHT AGREEMENT SCORING TIME!!!") - return 666 + first_start = min(starts) + last_end = max(ends) + 1 + coders = [] + #Creates a list of each annotator's highlights as a list where 0 is an unhighlighted index and 1 is a highlighted index + #e.g highlightAgreementScore([4, 3, 2], [6, 7, 5]) becomes [[0,0,1,1,1,0], [0,1,1,1,1,1], [1,1,1,1,0,0]] + for i in range(len(starts)): + highlights = np.zeros(last_end - first_start) + highlights[[x for x in range(starts[i] - first_start, ends[i] - first_start + 1)]] = 1 + print("Highlights " + str(i+1) + ": ", highlights) + coders.append(highlights) + + #Formats the codes properly as (coder,item,label) tuples + formatted_codes = [] + for annotator_num in range(len(coders)): + coder = coders[annotator_num] + formatted_codes += [[annotator_num+1, ind, coder[ind]] for ind in range(len(coder))] + ratingtask = agreement.AnnotationTask(data=formatted_codes) -coder1 = [1,0,2,0,1,1,2,0,1,1] -coder2 = [1,1,0,0,1,1,2,1,1,0] -coder3 = [1,2,2,1,2,1,2,1,1,0] -formatted_codes = [[1,i,coder1[i]] for i in range(len(coder1))] + [[2,i,coder2[i]] for i in range(len(coder2))] + [[3,i,coder3[i]] for i in range(len(coder3))] -ratingtask = agreement.AnnotationTask(data=formatted_codes) + 
avgAg = ratingtask.avg_Ao() + print('AVERAGE PAIRWISE AGREEMENT: ',avgAg) + # alpha = ratingtask.alpha() + # print('Krippendorff\'s alpha:',alpha) + # if alpha != 1: #other metrics error if alpha is 1 + # print('Fleiss\'s Kappa:',ratingtask.multi_kappa()) + # print('Scott\'s pi:',ratingtask.pi()) + return avgAg -print('Fleiss\'s Kappa:',ratingtask.multi_kappa()) -print('Krippendorff\'s alpha:',ratingtask.alpha()) -print('Scott\'s pi:',ratingtask.pi()) +highlightAgreementScore([2, 2, 2, 2, 2], [15, 15, 15, 15, 15]) +#Parent Agrement Scoring def AgreementScore(iaaData, schemaPath): - print("AGREEMENT SCORING TIME!!!") + print("PARENT AGREEMENT SCORING TIME!!!") print("OLD AGREEMENT SCORES:") print(iaaData[['question_Number', 'agreed_Answer', 'agreement_score']]) #TODO: AGREEMENT SCORE CHANGES HERE @@ -44,6 +69,49 @@ def AgreementScore(iaaData, schemaPath): print(iaaData[['question_Number', 'agreed_Answer', 'agreement_score']]) return iaaData +#Agreement scoring but scores of parents of parents don't affect children +def AgreementScoreReverse(iaaData, schemaPath): + print("PARENT AGREEMENT SCORING TIME!!!") + print("OLD AGREEMENT SCORES:") + print(iaaData[['question_Number', 'agreed_Answer', 'agreement_score']]) + #TODO: AGREEMENT SCORE CHANGES HERE + schemData = pd.read_csv(schemaPath, encoding = 'utf-8') + dependencies = create_dependencies_dict(schemData) + iaaQuestions = iaaData['question_Number'].tolist() + reversed_keys = list(dependencies.keys())[::-1] + for child in reversed_keys: + if child not in iaaQuestions: + continue + parents = dependencies[child].keys() + #TODO: clean this up + temp = [] + for parent in parents: + answers = dependencies[child][parent] + parentScores = iaaData[(iaaData['question_Number'] == parent)] + parentScores = parentScores[parentScores['agreed_Answer'].astype(int).isin(answers)] + temp.append(np.mean(parentScores['agreement_score'])) + avgParentScores = np.mean(temp) + iaaData['agreement_score'] = np.where(iaaData['question_Number'] == child, iaaData['agreement_score'] * avgParentScores, iaaData['agreement_score']) + #iaaData['agreement_score'] = np.zeros(3) + print("NEW AGREEMENT SCORES:") + print(iaaData[['question_Number', 'agreed_Answer', 'agreement_score']]) + return iaaData + +#Just for testing out the differences betwene metrics +def metricTest(): + coder1 = [1,0,2,0,1,1,2,0,1,1] + coder2 = [1,1,0,0,1,1,2,1,1,0] + coder3 = [1,2,2,1,2,1,2,1,1,0] + formatted_codes = [[1,i,coder1[i]] for i in range(len(coder1))] + [[2,i,coder2[i]] for i in range(len(coder2))] + [[3,i,coder3[i]] for i in range(len(coder3))] + print('RUNNING METRIC TEST') + ratingtask = agreement.AnnotationTask(data=formatted_codes) + print('Average pairwise agreement: ',ratingtask.avg_Ao()) + print('Cohen\'s Kappa:',ratingtask.kappa()) + print('Fleiss\'s Kappa:',ratingtask.multi_kappa()) + print('Krippendorff\'s alpha:',ratingtask.alpha()) + print('Scott\'s pi:',ratingtask.pi()) +#metricTest() + # Creates a dictionary of Parent Question: Answer: Child Questions # ex. {1: {1: [2], 2: [2]}, 2: {1: [4], 5: [4, 5], 8: [3]}, 5: {1: [6], 2: [6], 3: [6]}, 9: {1: [10, 11], 2: [10, 11]}} # T1.Q1.A1 changes T1.Q2, etc. 
diff --git a/consensus_and_scoring/ChecklistCoding.py b/consensus_and_scoring/ChecklistCoding.py index 8c6e7a1..8ecdd58 100644 --- a/consensus_and_scoring/ChecklistCoding.py +++ b/consensus_and_scoring/ChecklistCoding.py @@ -2,7 +2,7 @@ from AgreementScoring import highlightAgreementScore #from repScores import * -def scoreChecklist(answers,numUsers, num_choices): +def scoreChecklist(answers,numUsers, num_choices, starts, ends): out = [] #print('answers', answers, num_choices) length = num_choices+1 @@ -11,9 +11,26 @@ def scoreChecklist(answers,numUsers, num_choices): scores = np.zeros(length) for a in answers: scores[a] = scores[a]+1 + + starts_i = {} + ends_i = {} + for i in range(len(answers)): + a = answers[i] + if a not in starts_i: + starts_i[a] = [starts[i]] + ends_i[a] = [ends[i]] + else: + starts_i[a] += [starts[i]] + ends_i[a] += [ends[i]] + print(starts_i, ends_i) + for i in range(len(scores)): #print('scores', scores, numUsers) - out.append(scores[i]/numUsers) + hlAgreeFactor = 1 + if i in starts_i: + hlAgreeFactor = highlightAgreementScore(starts_i[i], ends_i[i]) + out.append(scores[i]/numUsers * hlAgreeFactor) + return out def evaluateChecklist(answers, users, starts, ends, numUsers, length, repDF,sourceText, hlUsers, hlAns, @@ -22,7 +39,7 @@ def evaluateChecklist(answers, users, starts, ends, numUsers, length, repDF,sour repScaledAnswers, repScaledUsers = repScaleAnsUsers(answers, users, repDF, useRep=useRep) #assert len(starts) == len(users), 'starts, users mismatched' #TODO: scale numUsers when repScaled gets scaled up - percArray = scoreChecklist(repScaledAnswers, numUsers, num_choices) + percArray = scoreChecklist(repScaledAnswers, numUsers, num_choices, starts, ends) out = [] for i in range(1,len(percArray)): codingScore = percArray[i] @@ -51,10 +68,6 @@ def evaluateChecklist(answers, users, starts, ends, numUsers, length, repDF,sour weightScaledNumUsers, userWeightDict, sourceText, useRep=useRep, threshold_func = threshold_func) firstSecondDiff = 1 - codingScore - - print("STARTS:",starts,"ENDS:",ends) - hlAgreeFactor = highlightAgreementScore(starts, ends) - #out.append(hlAgreeFactor) out.append([winner,units,uScore,iScore, codingScore, numUsers, selectedText, firstSecondDiff, 'checklist', num_choices]) #do_rep_calculation_nominal(users, answers, out[0], units, starts, ends, length, repDF,last30, checkListScale=(1/num_choices)) diff --git a/consensus_and_scoring/Dependency.py b/consensus_and_scoring/Dependency.py index 87e6524..8d66f2d 100644 --- a/consensus_and_scoring/Dependency.py +++ b/consensus_and_scoring/Dependency.py @@ -15,7 +15,7 @@ def eval_dependency(directory, iaa_dir, schema_dir, out_dir): # minimal check here; everything in the schema directory should be a schema csv if file.endswith('.csv'): file_path = os.path.join(dirpath, file) - print("found schema " + file_path) + #print("found schema " + file_path) schema.append(file_path) print("looking for IAA", iaa_dir) for dirpath, dirnames, files in os.walk(iaa_dir): diff --git a/consensus_and_scoring/test/test_IAA_basic.py b/consensus_and_scoring/test/test_IAA_basic.py index da3eb8a..509ed46 100644 --- a/consensus_and_scoring/test/test_IAA_basic.py +++ b/consensus_and_scoring/test/test_IAA_basic.py @@ -19,10 +19,11 @@ def test_iaa_constructor(config, tmpdir): # dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'C', 'start_pos':1, 'end_pos':4}) # dh.add_row({'answer_label': 'T1.Q2.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 
'start_pos':1, 'end_pos':4}) - dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'A', 'highlight_count':3, 'start_pos':1, 'end_pos':4}) - dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'B', 'highlight_count':3, 'start_pos':1, 'end_pos':4}) - dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'C', 'highlight_count':3, 'start_pos':1, 'end_pos':4}) - dh.add_row({'answer_label': 'T1.Q2.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 'highlight_count':3, 'start_pos':1, 'end_pos':4}) + dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'A', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'B', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'C', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q2.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) fin_path = dh.export() data_path = config['data_dir'] @@ -35,3 +36,4 @@ def test_iaa_constructor(config, tmpdir): #should be only 1 file for this case, so just run it on the only one # if there's more than 1 then you can get fancy out_df = pd.read_csv(os.path.join(iaa_out, file), encoding='utf-8') + print(out_df[['question_Number', 'agreed_Answer', 'agreement_score']]) diff --git a/consensus_and_scoring/test/test_agreement_score.py b/consensus_and_scoring/test/test_agreement_score.py index 0eb945e..e0f0a02 100644 --- a/consensus_and_scoring/test/test_agreement_score.py +++ b/consensus_and_scoring/test/test_agreement_score.py @@ -4,53 +4,31 @@ import test_utils from filegen_utils import * +from IAA import * from Dependency import * import conftest #REFERENCE: in Evidence, parents = {1.1:[2], 1.2:[2], 2.1:[4], 2.5:[4,5], 2.8:[3], 5.1:[6], 5.2:[6], 5.3:[6], 9.1:[10,11], 9.2:[10,11]} -def test_dep_sample(config): - iaa_files_path = test_utils.make_test_directory(config, 'agscore_sample') - out_path = test_utils.make_test_directory(config, 'agscore_sample_out') - # source_task_id generated by smashing keyboard - iaa = IAA_task(out_folder=iaa_files_path, source_task_id='agscore_test') - iaa.add_row({"namespace":"Covid_Evidence2020_03_21", "question_Number":1, "agreed_Answer":1, "agreement_score":.5, 'highlighted_indices':test_utils.make_highlight_indices(10,30)}) - iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 2, "agreed_Answer": 1, "agreement_score": .5}) - iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 4, "agreed_Answer": 1, "agreement_score": .5}) - fin_path = iaa.export() - data_path = config['data_dir'] - schema_path = data_path + '/schemas' - dh_path = None #doesn't get used by dependency but is still an argument - - eval_dependency(dh_path, iaa_files_path, schema_path, out_path) +def test_sample(config): + test_path = test_utils.make_test_directory(config, 'test_agscore') + 
iaa_files_path = test_utils.make_test_directory(config, 'test_agscore_iaa') + out_path = test_utils.make_test_directory(config, 'test_agscore_out') + #source_task_id generated by smashing keyboard + dh = datahunt(out_folder=test_path, source_task_id = 'oogabooga') - for root, dir, files in os.walk(out_path): - for file in files: - #should be only 1 file for this case, so just run it on the only one - # if there's more than 1 then you can get fancy - out_df = pd.read_csv(os.path.join(out_path, file), encoding='utf-8') - #9 answer choices to a checklist question - # assert len(out_df) == 2 - # q_three = out_df[out_df['question_Number']==2] - # hl = q_three['highlighted_indices'].iloc[0] - # assert len(hl) >18 - # assert '10' in hl - # assert '29' in hl + dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'A', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'B', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'C', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q2.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) -def test_dep_sample2(config): - iaa_files_path = test_utils.make_test_directory(config, 'agscore_sample2') - out_path = test_utils.make_test_directory(config, 'agscore_sample2_out') - # source_task_id generated by smashing keyboard - iaa = IAA_task(out_folder=iaa_files_path, source_task_id='agscore_test') - iaa.add_row({"namespace":"Covid_Evidence2020_03_21", "question_Number":1, "agreed_Answer":1, "agreement_score":.5}) - iaa.add_row({"namespace":"Covid_Evidence2020_03_21", "question_Number":1, "agreed_Answer":2, "agreement_score":1}) - iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 2, "agreed_Answer": 1, "agreement_score": 1}) - iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 2, "agreed_Answer": 2, "agreement_score": .5}) - fin_path = iaa.export() + fin_path = dh.export() data_path = config['data_dir'] - schema_path = data_path + '/schemas' - dh_path = None #doesn't get used by dependency but is still an argument + schema_path = data_path+'/schemas' + + iaa_out = calc_agreement_directory(test_path, schema_path, config['IAA_config_dir'], test_utils.texts_dir, outDirectory = iaa_files_path) - eval_dependency(dh_path, iaa_files_path, schema_path, out_path) + eval_dependency(test_path, iaa_files_path, schema_path, out_path) for root, dir, files in os.walk(out_path): for file in files: diff --git a/consensus_and_scoring/test/test_dep_basic.py b/consensus_and_scoring/test/test_dep_basic.py new file mode 100644 index 0000000..0d55958 --- /dev/null +++ b/consensus_and_scoring/test/test_dep_basic.py @@ -0,0 +1,52 @@ +import sys +import os +import pandas as pd + +import test_utils +from filegen_utils import * +from Dependency import * +import conftest + +#REFERENCE: in Evidence, parents = {1.1:[2], 1.2:[2], 2.1:[4], 2.5:[4,5], 2.8:[3], 5.1:[6], 5.2:[6], 5.3:[6], 9.1:[10,11], 9.2:[10,11]} 
+def test_dep_sample(config): + iaa_files_path = test_utils.make_test_directory(config, 'dep_basic') + out_path = test_utils.make_test_directory(config, 'dep_basic_out') + # source_task_id generated by smashing keyboard + iaa = IAA_task(out_folder=iaa_files_path, source_task_id='boogaboga') + iaa.add_row({"namespace":"Covid_Evidence2020_03_21", "question_Number":1, "agreed_Answer":1, "agreement_score":.5, 'highlighted_indices':test_utils.make_highlight_indices(10,30)}) + iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 2, "agreed_Answer": 1, "agreement_score": .5}) + iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 4, "agreed_Answer": 1, "agreement_score": .5}) + fin_path = iaa.export() + data_path = config['data_dir'] + schema_path = data_path + '/schemas' + dh_path = None #doesn't get used by dependency but is still an argument + + eval_dependency(dh_path, iaa_files_path, schema_path, out_path) + + for root, dir, files in os.walk(out_path): + for file in files: + #should be only 1 file for this case, so just run it on the only one + # if there's more than 1 then you can get fancy + out_df = pd.read_csv(os.path.join(out_path, file), encoding='utf-8') + +def test_dep_sample2(config): + iaa_files_path = test_utils.make_test_directory(config, 'dep_basic2') + out_path = test_utils.make_test_directory(config, 'dep_basic2_out') + # source_task_id generated by smashing keyboard + iaa = IAA_task(out_folder=iaa_files_path, source_task_id='boogabogas') + iaa.add_row({"namespace":"Covid_Evidence2020_03_21", "question_Number":1, "agreed_Answer":1, "agreement_score":.5}) + iaa.add_row({"namespace":"Covid_Evidence2020_03_21", "question_Number":1, "agreed_Answer":2, "agreement_score":1}) + iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 2, "agreed_Answer": 1, "agreement_score": 1}) + iaa.add_row({"namespace": "Covid_Evidence2020_03_21", "question_Number": 2, "agreed_Answer": 2, "agreement_score": .5}) + fin_path = iaa.export() + data_path = config['data_dir'] + schema_path = data_path + '/schemas' + dh_path = None #doesn't get used by dependency but is still an argument + + eval_dependency(dh_path, iaa_files_path, schema_path, out_path) + + for root, dir, files in os.walk(out_path): + for file in files: + #should be only 1 file for this case, so just run it on the only one + # if there's more than 1 then you can get fancy + out_df = pd.read_csv(os.path.join(out_path, file), encoding='utf-8') From dcd5e4cd371721d878b97a721d8e3409836e7c6e Mon Sep 17 00:00:00 2001 From: Jamie Date: Thu, 10 Dec 2020 15:29:35 -0800 Subject: [PATCH 6/8] extended highlight weighting to work for code questions, added weights and use arguments to both weighting functions --- consensus_and_scoring/AgreementScoring.py | 115 +++++------------- consensus_and_scoring/ChecklistCoding.py | 23 +--- consensus_and_scoring/Dependency.py | 2 +- consensus_and_scoring/IAA.py | 37 ++++++ consensus_and_scoring/test/test_IAA_basic.py | 30 +++-- .../test/test_agreement_score.py | 13 +- 6 files changed, 99 insertions(+), 121 deletions(-) diff --git a/consensus_and_scoring/AgreementScoring.py b/consensus_and_scoring/AgreementScoring.py index 73a595a..17c8fb4 100644 --- a/consensus_and_scoring/AgreementScoring.py +++ b/consensus_and_scoring/AgreementScoring.py @@ -4,15 +4,23 @@ from dataV3 import create_dependencies_dict from nltk import agreement -#Takes in starts and ends of highlights for a specific question answer, returns factor to scale answer's agreement score by -def 
highlightAgreementScore(starts, ends): - assert len(starts) == len(ends) - if len(starts) == 0: - return 0 - if len(starts) == 1: +#Changing Agreement Scores based on Highlights +#To enable, set use=True +#To dimnish the value it scales by, set weight to a lower value +#e.g. if score = 0.5 and weight = 0.5, it scales agscore by 0.75 instead of 0.5 +def highlightAgreementScore(starts, ends, weight=1, use=True): + if not use: + return 1 + if (not isinstance(starts, list) or not isinstance(ends, list)): + print("INVALID HIGHLIGHTS") + return 1 + if len(starts) != len(ends): + print("INVALID HIGHLIGHTS") + return 1 + if len(starts) <= 1: return 1 - print("HIGHLIGHT AGREEMENT SCORING TIME!!!") + # print("HIGHLIGHT AGREEMENT SCORING TIME!!!") first_start = min(starts) last_end = max(ends) + 1 coders = [] @@ -21,7 +29,7 @@ def highlightAgreementScore(starts, ends): for i in range(len(starts)): highlights = np.zeros(last_end - first_start) highlights[[x for x in range(starts[i] - first_start, ends[i] - first_start + 1)]] = 1 - print("Highlights " + str(i+1) + ": ", highlights) + #print("Highlights for Annotator " + str(i+1) + ": ", highlights) coders.append(highlights) #Formats the codes properly as (coder,item,label) tuples @@ -31,23 +39,21 @@ def highlightAgreementScore(starts, ends): formatted_codes += [[annotator_num+1, ind, coder[ind]] for ind in range(len(coder))] ratingtask = agreement.AnnotationTask(data=formatted_codes) + #Return the average agreement score of all highlights avgAg = ratingtask.avg_Ao() - print('AVERAGE PAIRWISE AGREEMENT: ',avgAg) - # alpha = ratingtask.alpha() - # print('Krippendorff\'s alpha:',alpha) - # if alpha != 1: #other metrics error if alpha is 1 - # print('Fleiss\'s Kappa:',ratingtask.multi_kappa()) - # print('Scott\'s pi:',ratingtask.pi()) - return avgAg - -highlightAgreementScore([2, 2, 2, 2, 2], [15, 15, 15, 15, 15]) + weighted_avgAg = 1 - ((1 - avgAg) * weight) + print('Average Pairwise Agreement: ' + str(avgAg) + ', Weighted: ' + str(weighted_avgAg)) + return weighted_avgAg -#Parent Agrement Scoring -def AgreementScore(iaaData, schemaPath): +#Changing Agreement Scores based on Parent Agreement Scores +#To enable, set use=True +#To dimnish the value it scales by, set weight to a lower value +def parentAgreementScore(iaaData, schemaPath, weight=1, use=True): + if not use: + return iaaData print("PARENT AGREEMENT SCORING TIME!!!") print("OLD AGREEMENT SCORES:") print(iaaData[['question_Number', 'agreed_Answer', 'agreement_score']]) - #TODO: AGREEMENT SCORE CHANGES HERE schemData = pd.read_csv(schemaPath, encoding = 'utf-8') dependencies = create_dependencies_dict(schemData) iaaQuestions = iaaData['question_Number'].tolist() @@ -55,7 +61,7 @@ def AgreementScore(iaaData, schemaPath): if child not in iaaQuestions: continue parents = dependencies[child].keys() - #TODO: clean this up + #TODO: clean this bit up? 
temp = [] for parent in parents: answers = dependencies[child][parent] @@ -63,70 +69,9 @@ def AgreementScore(iaaData, schemaPath): parentScores = parentScores[parentScores['agreed_Answer'].astype(int).isin(answers)] temp.append(np.mean(parentScores['agreement_score'])) avgParentScores = np.mean(temp) - iaaData['agreement_score'] = np.where(iaaData['question_Number'] == child, iaaData['agreement_score'] * avgParentScores, iaaData['agreement_score']) - #iaaData['agreement_score'] = np.zeros(3) + weighted_avgParentScores = 1 - ((1 - avgParentScores) * weight) + iaaData['agreement_score'] = np.where(iaaData['question_Number'] == child, + iaaData['agreement_score'] * weighted_avgParentScores, iaaData['agreement_score']) print("NEW AGREEMENT SCORES:") print(iaaData[['question_Number', 'agreed_Answer', 'agreement_score']]) return iaaData - -#Agreement scoring but scores of parents of parents don't affect children -def AgreementScoreReverse(iaaData, schemaPath): - print("PARENT AGREEMENT SCORING TIME!!!") - print("OLD AGREEMENT SCORES:") - print(iaaData[['question_Number', 'agreed_Answer', 'agreement_score']]) - #TODO: AGREEMENT SCORE CHANGES HERE - schemData = pd.read_csv(schemaPath, encoding = 'utf-8') - dependencies = create_dependencies_dict(schemData) - iaaQuestions = iaaData['question_Number'].tolist() - reversed_keys = list(dependencies.keys())[::-1] - for child in reversed_keys: - if child not in iaaQuestions: - continue - parents = dependencies[child].keys() - #TODO: clean this up - temp = [] - for parent in parents: - answers = dependencies[child][parent] - parentScores = iaaData[(iaaData['question_Number'] == parent)] - parentScores = parentScores[parentScores['agreed_Answer'].astype(int).isin(answers)] - temp.append(np.mean(parentScores['agreement_score'])) - avgParentScores = np.mean(temp) - iaaData['agreement_score'] = np.where(iaaData['question_Number'] == child, iaaData['agreement_score'] * avgParentScores, iaaData['agreement_score']) - #iaaData['agreement_score'] = np.zeros(3) - print("NEW AGREEMENT SCORES:") - print(iaaData[['question_Number', 'agreed_Answer', 'agreement_score']]) - return iaaData - -#Just for testing out the differences betwene metrics -def metricTest(): - coder1 = [1,0,2,0,1,1,2,0,1,1] - coder2 = [1,1,0,0,1,1,2,1,1,0] - coder3 = [1,2,2,1,2,1,2,1,1,0] - formatted_codes = [[1,i,coder1[i]] for i in range(len(coder1))] + [[2,i,coder2[i]] for i in range(len(coder2))] + [[3,i,coder3[i]] for i in range(len(coder3))] - print('RUNNING METRIC TEST') - ratingtask = agreement.AnnotationTask(data=formatted_codes) - print('Average pairwise agreement: ',ratingtask.avg_Ao()) - print('Cohen\'s Kappa:',ratingtask.kappa()) - print('Fleiss\'s Kappa:',ratingtask.multi_kappa()) - print('Krippendorff\'s alpha:',ratingtask.alpha()) - print('Scott\'s pi:',ratingtask.pi()) -#metricTest() - -# Creates a dictionary of Parent Question: Answer: Child Questions -# ex. {1: {1: [2], 2: [2]}, 2: {1: [4], 5: [4, 5], 8: [3]}, 5: {1: [6], 2: [6], 3: [6]}, 9: {1: [10, 11], 2: [10, 11]}} -# T1.Q1.A1 changes T1.Q2, etc. 
-# I wrote this function and it works but didn't actually end up using it since create_dependencies_dict was better -def create_parents_dict(schemadata): - df = schemadata[schemadata['answer_next_questions'].notna()] - parents = df['answer_label'].tolist() - children = df['answer_next_questions'].tolist() - dict = {} - for i in range(len(parents)): - parent_q = int(re.findall(r"Q(\d+)", parents[i])[0]) - parent_a = int(re.findall(r"A(\d+)", parents[i])[0]) - child_q = [int(q) for q in re.findall(r"Q(\d+)", children[i])] - if parent_q not in dict: - dict[parent_q] = {parent_a:child_q} - else: - dict[parent_q][parent_a] = child_q - return dict diff --git a/consensus_and_scoring/ChecklistCoding.py b/consensus_and_scoring/ChecklistCoding.py index 8ecdd58..ebe7b72 100644 --- a/consensus_and_scoring/ChecklistCoding.py +++ b/consensus_and_scoring/ChecklistCoding.py @@ -2,7 +2,7 @@ from AgreementScoring import highlightAgreementScore #from repScores import * -def scoreChecklist(answers,numUsers, num_choices, starts, ends): +def scoreChecklist(answers,numUsers, num_choices): out = [] #print('answers', answers, num_choices) length = num_choices+1 @@ -11,26 +11,9 @@ def scoreChecklist(answers,numUsers, num_choices, starts, ends): scores = np.zeros(length) for a in answers: scores[a] = scores[a]+1 - - starts_i = {} - ends_i = {} - for i in range(len(answers)): - a = answers[i] - if a not in starts_i: - starts_i[a] = [starts[i]] - ends_i[a] = [ends[i]] - else: - starts_i[a] += [starts[i]] - ends_i[a] += [ends[i]] - print(starts_i, ends_i) - for i in range(len(scores)): #print('scores', scores, numUsers) - hlAgreeFactor = 1 - if i in starts_i: - hlAgreeFactor = highlightAgreementScore(starts_i[i], ends_i[i]) - out.append(scores[i]/numUsers * hlAgreeFactor) - + out.append(scores[i]/numUsers) return out def evaluateChecklist(answers, users, starts, ends, numUsers, length, repDF,sourceText, hlUsers, hlAns, @@ -39,7 +22,7 @@ def evaluateChecklist(answers, users, starts, ends, numUsers, length, repDF,sour repScaledAnswers, repScaledUsers = repScaleAnsUsers(answers, users, repDF, useRep=useRep) #assert len(starts) == len(users), 'starts, users mismatched' #TODO: scale numUsers when repScaled gets scaled up - percArray = scoreChecklist(repScaledAnswers, numUsers, num_choices, starts, ends) + percArray = scoreChecklist(repScaledAnswers, numUsers, num_choices) out = [] for i in range(1,len(percArray)): codingScore = percArray[i] diff --git a/consensus_and_scoring/Dependency.py b/consensus_and_scoring/Dependency.py index 8d66f2d..5880bbe 100644 --- a/consensus_and_scoring/Dependency.py +++ b/consensus_and_scoring/Dependency.py @@ -133,7 +133,7 @@ def handleDependencies(schemaPath, iaaPath, out_dir): indices = merge_indices(row_indices, indices).tolist() iaaData.at[row, 'highlighted_indices'] = json.dumps(indices) - iaaData = AgreementScore(iaaData, schemaPath) + iaaData = parentAgreementScore(iaaData, schemaPath) print('exporting to csv') path, name = get_path(iaaPath) diff --git a/consensus_and_scoring/IAA.py b/consensus_and_scoring/IAA.py index 8e06fb7..71f9fb0 100644 --- a/consensus_and_scoring/IAA.py +++ b/consensus_and_scoring/IAA.py @@ -252,6 +252,43 @@ def score(article, ques, data, config_path, text_file, schemaFile, repDF = None, elif question_type == 'checklist': out = evaluateChecklist(answers, users, starts, ends, numUsers, length, repDF, sourceText, hlUsers, hlAns, num_choices = num_choices, useRep=useRep, threshold_func = threshold_func) + + #Only change agreement score by highlights if highlights 
exist + if (isinstance(starts, list) and len(answers) == len(starts) and len(starts) == len(ends)): + starts_i = {} + ends_i = {} + #For this question, map all answers to their starting and ending highlights + #e.g. starts_i = {1: [5, 5]} means Answer 1 for this question has two users start highlights on index 5 + for i in range(len(answers)): + a = answers[i] + try: + if a not in starts_i: + starts_i[a] = [starts[i]] + ends_i[a] = [ends[i]] + except: + print("ERROR", a, i) + else: + starts_i[a] += [starts[i]] + ends_i[a] += [ends[i]] + print("Question", ques, "{Answer:Highlight_Starts}:", starts_i, "{Answer:Highlight_Ends}:", ends_i) + #Change each answer's agreement score based on the answer's highlighting agreement + if question_type == 'checklist': + for stuff in out: + ans_num = stuff[0] + old_ag_score = stuff[4] + if ans_num in starts_i: + hlAgreeFactor = highlightAgreementScore(starts_i[ans_num], ends_i[ans_num]) + print("Agreement Score transformed from", old_ag_score, "to", old_ag_score * hlAgreeFactor,"\n") + stuff[4] = old_ag_score * hlAgreeFactor + else: + ans_num = out[0] + old_ag_score = out[4] + if ans_num in starts_i: + hlAgreeFactor = highlightAgreementScore(starts_i[ans_num], ends_i[ans_num]) + print("Agreement Score transformed from", old_ag_score, "to", old_ag_score * hlAgreeFactor,"\n") + temp_out = list(out) + temp_out[4] = old_ag_score * hlAgreeFactor + out = tuple(temp_out) return out diff --git a/consensus_and_scoring/test/test_IAA_basic.py b/consensus_and_scoring/test/test_IAA_basic.py index 509ed46..7585ceb 100644 --- a/consensus_and_scoring/test/test_IAA_basic.py +++ b/consensus_and_scoring/test/test_IAA_basic.py @@ -14,16 +14,26 @@ def test_iaa_constructor(config, tmpdir): #source_task_id generated by smashing keyboard dh = datahunt(out_folder=test_path, source_task_id = 'oogabooga') - # dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'A', 'start_pos':1, 'end_pos':4}) - # dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'B', 'start_pos':2, 'end_pos':4}) - # dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'C', 'start_pos':1, 'end_pos':4}) - # dh.add_row({'answer_label': 'T1.Q2.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 'start_pos':1, 'end_pos':4}) - - dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'A', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) - dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'B', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) - dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'C', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) - dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) - dh.add_row({'answer_label': 'T1.Q2.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'A', 'highlight_count':10, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 
'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'B', 'highlight_count':10, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'C', 'highlight_count':10, 'start_pos':1, 'end_pos':5, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 'highlight_count':10, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + + + dh.add_row({'answer_label': 'T1.Q2.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'A', 'highlight_count':10, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q2.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'B', 'highlight_count':10, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q2.A4', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'C', 'highlight_count':5, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q2.A5', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 'highlight_count':10, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q2.A5', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'E', 'highlight_count':10, 'start_pos':1, 'end_pos':9, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q2.A5', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'F', 'highlight_count':10, 'start_pos':1, 'end_pos':8, 'article_text_length': 100}) + + dh.add_row({'answer_label': 'T1.Q4.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'A', 'highlight_count':10, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q4.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'B', 'highlight_count':10, 'start_pos':1, 'end_pos':8, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q4.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'C', 'highlight_count':10, 'start_pos':1, 'end_pos':9, 'article_text_length': 100}) + + dh.add_row({'answer_label': 'T1.Q5.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'A', 'highlight_count':10, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q5.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'B', 'highlight_count':10, 'start_pos':1, 'end_pos':5, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q5.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'C', 'highlight_count':10, 'start_pos':1, 'end_pos':5, 'article_text_length': 100}) fin_path = dh.export() data_path = config['data_dir'] diff --git a/consensus_and_scoring/test/test_agreement_score.py b/consensus_and_scoring/test/test_agreement_score.py index e0f0a02..d544b4c 100644 --- a/consensus_and_scoring/test/test_agreement_score.py +++ b/consensus_and_scoring/test/test_agreement_score.py @@ -16,11 +16,13 @@ def test_sample(config): #source_task_id generated by smashing keyboard dh = datahunt(out_folder=test_path, source_task_id = 'oogabooga') - dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'A', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) - dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'B', 
'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) - dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'C', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) - dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) - dh.add_row({'answer_label': 'T1.Q2.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 'highlight_count':3, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'A', 'highlight_count':10, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q1.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'B', 'highlight_count':10, 'start_pos':1, 'end_pos':5, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'C', 'highlight_count':10, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q1.A2', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'D', 'highlight_count':10, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + dh.add_row({'answer_label': 'T1.Q1.A3', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'E', 'highlight_count':10, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) + + dh.add_row({'answer_label': 'T1.Q2.A1', 'namespace': 'Covid_Evidence2020_03_21', 'contributor_uuid':'A', 'highlight_count':10, 'start_pos':1, 'end_pos':10, 'article_text_length': 100}) fin_path = dh.export() data_path = config['data_dir'] @@ -35,3 +37,4 @@ def test_sample(config): #should be only 1 file for this case, so just run it on the only one # if there's more than 1 then you can get fancy out_df = pd.read_csv(os.path.join(out_path, file), encoding='utf-8') + print(out_df[['question_Number', 'agreed_Answer', 'agreement_score']]) From d25225d1b1b5b04674dd4952d218f2f6a89262bc Mon Sep 17 00:00:00 2001 From: Jamie Date: Thu, 10 Dec 2020 15:42:37 -0800 Subject: [PATCH 7/8] cleaned up code a bit, added comments --- consensus_and_scoring/AgreementScoring.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/consensus_and_scoring/AgreementScoring.py b/consensus_and_scoring/AgreementScoring.py index 17c8fb4..71f7750 100644 --- a/consensus_and_scoring/AgreementScoring.py +++ b/consensus_and_scoring/AgreementScoring.py @@ -32,14 +32,14 @@ def highlightAgreementScore(starts, ends, weight=1, use=True): #print("Highlights for Annotator " + str(i+1) + ": ", highlights) coders.append(highlights) - #Formats the codes properly as (coder,item,label) tuples + #Formats the codes properly as (coder,item,label) tuples (required by avg_Ao) formatted_codes = [] for annotator_num in range(len(coders)): coder = coders[annotator_num] formatted_codes += [[annotator_num+1, ind, coder[ind]] for ind in range(len(coder))] ratingtask = agreement.AnnotationTask(data=formatted_codes) - #Return the average agreement score of all highlights + #Return the weighted average agreement score of all highlights avgAg = ratingtask.avg_Ao() weighted_avgAg = 1 - ((1 - avgAg) * weight) print('Average Pairwise Agreement: ' + str(avgAg) + ', Weighted: ' + str(weighted_avgAg)) @@ -54,14 +54,21 @@ def parentAgreementScore(iaaData, schemaPath, weight=1, use=True): 
print("PARENT AGREEMENT SCORING TIME!!!") print("OLD AGREEMENT SCORES:") print(iaaData[['question_Number', 'agreed_Answer', 'agreement_score']]) + + #Get a dictionary of children and parents schemData = pd.read_csv(schemaPath, encoding = 'utf-8') dependencies = create_dependencies_dict(schemData) iaaQuestions = iaaData['question_Number'].tolist() + + #For each child, if present in the iaaData, calculate a new agreement score for child in dependencies.keys(): if child not in iaaQuestions: continue parents = dependencies[child].keys() + #TODO: clean this bit up? + #Children can have multiple parent questions that each can have multiple parent answers + #For each parent question, assign each parent answer score to parentScores, then append the mean score to temp temp = [] for parent in parents: answers = dependencies[child][parent] From 0d40af14b4edfb43b0c403cf67877b99adc6dfea Mon Sep 17 00:00:00 2001 From: Jamie Date: Thu, 10 Dec 2020 15:49:16 -0800 Subject: [PATCH 8/8] minor change --- consensus_and_scoring/IAA.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/consensus_and_scoring/IAA.py b/consensus_and_scoring/IAA.py index 71f9fb0..56faa85 100644 --- a/consensus_and_scoring/IAA.py +++ b/consensus_and_scoring/IAA.py @@ -261,12 +261,9 @@ def score(article, ques, data, config_path, text_file, schemaFile, repDF = None, #e.g. starts_i = {1: [5, 5]} means Answer 1 for this question has two users start highlights on index 5 for i in range(len(answers)): a = answers[i] - try: - if a not in starts_i: - starts_i[a] = [starts[i]] - ends_i[a] = [ends[i]] - except: - print("ERROR", a, i) + if a not in starts_i: + starts_i[a] = [starts[i]] + ends_i[a] = [ends[i]] else: starts_i[a] += [starts[i]] ends_i[a] += [ends[i]]