| | import os |
| | import argparse |
| | import json |
| | from tqdm import tqdm |
| |
|
| |
|
def load_url_text_map(knowledge_store_dir, claim_id):
    """Build a URL -> scraped-text lookup for one claim.

    Reads ``<knowledge_store_dir>/<claim_id>.json`` as a JSON-lines file:
    every non-blank line is parsed as a JSON object whose ``"url"`` value
    becomes the key and whose ``"url2text"`` entries are joined with single
    spaces to form the value.

    Args:
        knowledge_store_dir: Directory holding the per-claim knowledge files.
        claim_id: Identifier used to build the file name.

    Returns:
        A dict mapping each URL to its concatenated text. The dict is empty
        when the file for ``claim_id`` does not exist. If a URL occurs on
        several lines, the last occurrence wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``"url"`` or ``"url2text"`` key.
    """
    url_text_map = {}
    knowledge_file = os.path.join(knowledge_store_dir, f"{claim_id}.json")

    # A missing knowledge file is not an error: the caller just gets no texts.
    if not os.path.exists(knowledge_file):
        return url_text_map

    # Read explicitly as UTF-8 so scraped text decodes the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(knowledge_file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            url_text_map[data["url"]] = " ".join(data["url2text"])

    return url_text_map
| |
|
| |
|
if __name__ == "__main__":
    # Script entry point: read the veracity predictions, attach the scraped
    # text of every evidence URL found in the knowledge store, and write the
    # enriched predictions to the output file.
    parser = argparse.ArgumentParser(
        description="Add scraped_text field to the prediction file."
    )
    parser.add_argument(
        "-i",
        "--veracity_prediction_file",
        default="data_store/dev_veracity_prediction.json",
        help="Json file with the veracity predictions.",
    )
    parser.add_argument(
        "-o",
        "--output_file",
        default="data_store/dev_veracity_prediction_for_submission.json",
        help="Json file with the veracity predictions and the scraped_text.",
    )
    parser.add_argument(
        "--knowledge_store_dir",
        type=str,
        # Without a value the path join in load_url_text_map fails with an
        # opaque TypeError; let argparse report the missing option instead.
        required=True,
        help="Directory of json files of the knowledge store containing url2text.",
    )
    args = parser.parse_args()

    # Read as UTF-8 to match the encoding used when writing the output below.
    with open(args.veracity_prediction_file, encoding="utf-8") as f:
        predictions = json.load(f)

    for claim in tqdm(predictions, desc="Processing claims"):
        claim_id = claim["claim_id"]
        url_text_map = load_url_text_map(args.knowledge_store_dir, claim_id)

        # Evidence dicts are updated in place, so `predictions` carries the
        # new "scraped_text" fields when it is dumped afterwards.
        for evidence in claim["evidence"]:
            url = evidence["url"]
            scraped_text = url_text_map.get(url)
            if scraped_text:
                evidence["scraped_text"] = scraped_text
            else:
                # tqdm.write prints to stdout like print() but keeps the
                # progress bar from being garbled by interleaved output.
                tqdm.write(
                    f"Warning: No scraped text found for claim_id {claim_id} and url {url}"
                )

    with open(args.output_file, "w", encoding="utf-8") as output_file:
        json.dump(predictions, output_file, ensure_ascii=False, indent=4)

    print(f"Updated JSON saved to {args.output_file}")
| |
|