pyth/articles.py

import psycopg2
import numpy as np
from dotenv import load_dotenv
import os
from openai import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings
from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
import json
from json_repair import repair_json

# Load environment configuration before the API clients below are constructed.
# NOTE(review): presumably this reads a local .env file (python-dotenv) — confirm.
load_dotenv()

# The key is read here but never passed explicitly to either client in the
# visible code; presumably both clients pick it up from the environment — confirm.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI()  # chat-completions client used by processing_similar()
embeddings = OpenAIEmbeddings()  # embeds the generated text before insert

# Executed at import time, not only when the module is run as a script.
print(f"Checking for similar!")

def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    """Greedily bucket stored articles whose embeddings are near-duplicates.

    Articles are visited in the order returned by
    ``get_titles_links_embeddings``. Every article that has not been placed
    in a bucket yet seeds a new bucket and pulls in each other unplaced
    article whose cosine similarity to the seed is strictly greater than
    ``threshold``. Similarity is only measured against the seed, never
    between the other members of a bucket.

    Args:
        eps: Accepted but not used by this function.
        min_samples: Accepted but not used by this function.
        threshold: Similarity a candidate must exceed to join a bucket.

    Returns:
        A list of buckets, each a list of ``(title, link)`` tuples with the
        seed first. Articles without any match come back as single-item
        buckets. An empty list is returned if a ``psycopg2.Error`` is raised.
    """
    try:
        titles, links, vectors = get_titles_links_embeddings()
        records = list(zip(titles, links, vectors))

        seen = set()
        buckets = []

        for seed_pos, (seed_title, seed_link, seed_vector) in enumerate(records):
            seed_key = (seed_title, seed_link)
            if seed_key in seen:
                # Already absorbed by an earlier bucket.
                continue

            seen.add(seed_key)
            bucket = [seed_key]

            for other_pos, (other_title, other_link, other_vector) in enumerate(records):
                other_key = (other_title, other_link)
                if other_pos == seed_pos or other_key in seen:
                    continue

                score = calculate_cosine_similarity(seed_vector, other_vector)
                if score > threshold:
                    seen.add(other_key)
                    bucket.append(other_key)

            buckets.append(bucket)

        return buckets

    except psycopg2.Error as e:
        print(f"Error: {e}")
        return []
    
# Combined source text above this many tokens is truncated before prompting.
_COMBINE_TOKEN_LIMIT = 2000


def _join_unique_links(links):
    """Return the distinct links, in first-seen order, as one stored value."""
    unique = list(dict.fromkeys(links))
    if len(unique) == 1:
        # A single source link is stored as-is rather than re-formatted.
        return unique[0]
    return ", ".join(str(item) for item in unique)


def _build_combine_prompt(texts):
    """Build the user prompt asking the model to merge ``texts`` into one story."""
    count = len(texts)
    total_tokens = sum(num_tokens_from_string(text) for text in texts)

    if total_tokens > _COMBINE_TOKEN_LIMIT:
        combined_text = slice_text_at_2k_tokens(" ".join(texts))
        return (
            f"Here is text {combined_text}, combined from {count} sources, "
            "filter text, and make news content, return as JSON only with a "
            "single 'content' field"
        )

    # Two texts are listed plainly; three or more get "and" before the last one.
    if count > 2:
        listed = " ".join(texts[:-1]) + " and " + texts[-1]
    else:
        listed = " ".join(texts)
    return (
        f"Here are {count} texts {listed}, combine the following texts into a "
        "cohesive news, remove any non-news related to all texts, and provide "
        "the cleaned data as a JSON only with a single 'content' field."
    )


def processing_similar():
    """Merge each group of similar articles into one combined article.

    For every group of two or more articles returned by
    ``find_and_group_similar_articles`` this function:

    1. loads each article's stored row via ``get_specific_data`` and reads
       the text from column 1 and the link from column 2 of the first row;
    2. calls ``modify_similar_data`` and ``preparing_articles(False, ...)``
       for every member of the group
       (NOTE(review): presumably this flags the originals as merged — confirm
       against vectData);
    3. asks the chat model for a JSON object with a single ``content`` field;
    4. embeds the repaired JSON string and stores the result with
       ``insert_data`` under the first article's title.

    Groups of any size are handled by one code path. A group whose stored
    data is missing is reported and skipped. A failure while generating,
    parsing, embedding or inserting is printed and the loop moves on to the
    next group.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    groups = find_and_group_similar_articles()
    if not groups:
        print("No similar articles found.")
        return

    for group in groups:
        if len(group) <= 1:
            continue

        titles = [entry[0] for entry in group if len(entry) >= 2]
        if len(titles) < 2:
            # Nothing to combine once malformed entries are dropped.
            continue

        fetched = [get_specific_data(title) for title in titles]
        try:
            records = [rows[0] for rows in fetched]
        except (IndexError, TypeError):
            # An article vanished or returned no row; leave the group untouched.
            print(f"Error: missing stored data for group starting with {titles[0]}")
            continue

        texts = [record[1] for record in records]
        link = _join_unique_links([record[2] for record in records])
        similar_d = "C: " + ", ".join(titles)

        # Build the prompt first so a failure here leaves the database untouched.
        user_message = _build_combine_prompt(texts)

        for title in titles:
            modify_similar_data(similar_d, title)
            preparing_articles(False, title)

        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": user_message},
                ],
            )
            generated_text = completion.choices[0].message.content

            # The model does not always return strictly valid JSON.
            generated_text = repair_json(generated_text)
            response_data = json.loads(generated_text)

            title = titles[0]
            text = response_data["content"]
            # NOTE(review): this embeds the whole JSON string, not just
            # ``text`` — confirm that is intended.
            vector = embeddings.embed_query(generated_text)

            insert_data(title, text, link, vector, similar_d)
            print(f"Inserting combined: {title}")

        except Exception as e:
            # Best-effort per group: report and carry on with the next one.
            print(f"Error: {e}")
            print(titles[0])
            continue

    print("Done!.")
# Run the combine pass only when this file is executed directly as a script.
if __name__=="__main__":
    processing_similar()
added article.py 2024-01-06 08:17:05 +01:00			`import psycopg2`
			`import numpy as np`
			`from dotenv import load_dotenv`
			`import os`
organizing code 2024-01-07 03:41:32 +01:00			`from openai import OpenAI`
added article.py 2024-01-06 08:17:05 +01:00			`from langchain.embeddings import OpenAIEmbeddings`
organizing code 2024-01-07 03:41:32 +01:00			`from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings`
added article.py 2024-01-06 08:17:05 +01:00			`from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens`
			`import json`
Fixed response/JSON 2024-01-08 00:28:20 +01:00			`from json_repair import repair_json`
added article.py 2024-01-06 08:17:05 +01:00
			`load_dotenv()`

			`OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")`
			`client = OpenAI()`
			`embeddings = OpenAIEmbeddings()`

			`print(f"Checking for similar!")`

			`def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):`
			`try:`
organizing code 2024-01-07 03:41:32 +01:00			`titles, links, embeddings = get_titles_links_embeddings()`
added article.py 2024-01-06 08:17:05 +01:00
organizing code 2024-01-07 03:41:32 +01:00			`processed_articles = set()`
			`grouped_similar_articles = []`
added article.py 2024-01-06 08:17:05 +01:00
organizing code 2024-01-07 03:41:32 +01:00			`for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)):`
			`if (title1, link1) not in processed_articles:`
			`processed_articles.add((title1, link1))`
			`group = [(title1, link1)]`
added article.py 2024-01-06 08:17:05 +01:00
organizing code 2024-01-07 03:41:32 +01:00			`for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)):`
			`if i != j and (title2, link2) not in processed_articles:`
			`similarity = calculate_cosine_similarity(embedding1, embedding2)`
added article.py 2024-01-06 08:17:05 +01:00
organizing code 2024-01-07 03:41:32 +01:00			`if similarity > threshold:`
			`processed_articles.add((title2, link2))`
			`group.append((title2, link2))`
added article.py 2024-01-06 08:17:05 +01:00
organizing code 2024-01-07 03:41:32 +01:00			`grouped_similar_articles.append(group)`
added article.py 2024-01-06 08:17:05 +01:00
organizing code 2024-01-07 03:41:32 +01:00			`return grouped_similar_articles`
added article.py 2024-01-06 08:17:05 +01:00
			`except psycopg2.Error as e:`
			`print(f"Error: {e}")`
			`return []`

			`def processing_similar():`
			`grouped_similar_articles_result = find_and_group_similar_articles()`

			`if grouped_similar_articles_result:`
			`for group in grouped_similar_articles_result:`
			`articles = []`

			`if len(group) > 1:`
			`for article_tuple in group:`
			`if len(article_tuple) >= 2:`
			`title, link = article_tuple[:2]`
			`article = [title, link]`
			`articles.append(article)`
			`l = len(articles)`
organizing code 2024-01-07 03:41:32 +01:00
added article.py 2024-01-06 08:17:05 +01:00			`if l == 2:`
			`a_one = articles[0][0]`
			`a_two = articles[1][0]`

			`get_one = get_specific_data(a_one)`
			`get_two = get_specific_data(a_two)`

			`text1 = get_one[0][1]`
			`text2 = get_two[0][1]`
			`link1 = get_one[0][2]`
			`link2 = get_two[0][2]`
			`if link1 != link2:`
			`link = f"{link1}, {link2}"`
			`else:`
			`link = link1`

			`ftoks = num_tokens_from_string(text1)`
			`stoks = num_tokens_from_string(text2)`
			`tokens = ftoks + stoks`

			`similar_d = f"C: {a_one}, {a_two}"`

			`modify_similar_data(similar_d, a_one)`
			`preparing_articles(False, a_one)`

			`modify_similar_data(similar_d, a_two)`
			`preparing_articles(False, a_two)`

			`if tokens > 2000:`
			`combined_text = f"{text1} {text2}"`
			`combined_text = slice_text_at_2k_tokens(combined_text)`
			`user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field"`
			`else:`
			`user_message = rf"Here are 2 texts {text1} {text2}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."`

			`if l == 3:`
			`a_one = articles[0][0]`
			`a_two = articles[1][0]`
			`a_three = articles[2][0]`

			`get_one = get_specific_data(a_one)`
			`get_two = get_specific_data(a_two)`
			`get_three = get_specific_data(a_three)`

			`text1 = get_one[0][1]`
			`text2 = get_two[0][1]`
			`text3 = get_three[0][1]`
			`link1 = get_one[0][2]`
			`link2 = get_two[0][2]`
			`link3 = get_three[0][2]`
			`if link1 != link2:`
			`if link2 != link3:`
			`link = f"{link1}, {link2}, {link3}"`
			`else:`
			`link = f"{link1}, {link2}"`
			`else:`
			`if link2 != link3:`
			`link = f"{link1}, {link3}"`
			`else:`
			`link = link1`
			`ftoks = num_tokens_from_string(text1)`
			`stoks = num_tokens_from_string(text2)`
			`ttoks = num_tokens_from_string(text3)`
			`tokens = ftoks + stoks + ttoks`

			`similar_d = f"C: {a_one}, {a_two}, {a_three}"`
			`modify_similar_data(similar_d, a_one)`
			`preparing_articles(False, a_one)`

			`modify_similar_data(similar_d, a_two)`
			`preparing_articles(False, a_two)`

			`modify_similar_data(similar_d, a_three)`
			`preparing_articles(False, a_three)`

			`if tokens > 2000:`
			`combined_text = f"{text1} {text2} {text3}"`
			`combined_text = slice_text_at_2k_tokens(combined_text)`
			`user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field"`
			`else:`
			`user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."`
organizing code 2024-01-07 03:41:32 +01:00			`if l == 4:`
			`a_one = articles[0][0]`
			`a_two = articles[1][0]`
			`a_three = articles[2][0]`
			`a_four = articles[3][0]`

			`get_one = get_specific_data(a_one)`
			`get_two = get_specific_data(a_two)`
			`get_three = get_specific_data(a_three)`
			`get_four = get_specific_data(a_four)`

			`text1 = get_one[0][1]`
			`text2 = get_two[0][1]`
			`text3 = get_three[0][1]`
			`text4 = get_four[0][1]`
			`link1 = get_one[0][2]`
			`link2 = get_two[0][2]`
			`link3 = get_three[0][2]`
			`link4 = get_four[0][2]`

			`if link1 != link2:`
			`if link2 != link3:`
			`if link3 != link4:`
			`link = f"{link1}, {link2}, {link3}, {link4}"`
			`else:`
			`link = f"{link1}, {link2}, {link3}"`
			`else:`
			`if link3 != link4:`
			`link = f"{link1}, {link2}, {link4}"`
			`else:`
			`link = f"{link1}, {link2}"`
			`else:`
			`if link2 != link3:`
			`if link3 != link4:`
			`link = f"{link1}, {link3}, {link4}"`
			`else:`
			`link = f"{link1}, {link3}"`
			`else:`
			`if link3 != link4:`
			`link = f"{link1}, {link4}"`
			`else:`
			`link = link1`

			`ftoks = num_tokens_from_string(text1)`
			`stoks = num_tokens_from_string(text2)`
			`ttoks = num_tokens_from_string(text3)`
			`frtoks = num_tokens_from_string(text4)`

			`tokens = ftoks + stoks + ttoks + frtoks`

			`similar_d = f"C: {a_one}, {a_two}, {a_three}, {a_four}"`
			`modify_similar_data(similar_d, a_one)`
			`preparing_articles(False, a_one)`

			`modify_similar_data(similar_d, a_two)`
			`preparing_articles(False, a_two)`

			`modify_similar_data(similar_d, a_three)`
			`preparing_articles(False, a_three)`

			`modify_similar_data(similar_d, a_four)`
			`preparing_articles(False, a_four)`

			`if tokens > 2000:`
			`combined_text = f"{text1} {text2} {text3} {text4}"`
			`combined_text = slice_text_at_2k_tokens(combined_text)`
			`user_message = rf"Here is text {combined_text}, combined from 4 sources, filter text, and make news content, return as JSON only with a single 'content' field"`
			`else:`
			`user_message = rf"Here are 4 texts {text1} {text2} {text3} and {text4}, combine the following texts into a cohesive news, remove any non-news related to all texts, and provide the cleaned data as a JSON only with a single 'content' field."`
added article.py 2024-01-06 08:17:05 +01:00			`try:`
			`completion = client.chat.completions.create(`
			`model="gpt-3.5-turbo",`
			`messages=[`
			`{"role": "system", "content": "Data analytic, Journalist and News reporter"},`
			`{"role": "user", "content": user_message}`
			`])`
			`generated_text = completion.choices[0].message.content`

Fixed response/JSON 2024-01-08 00:28:20 +01:00			`generated_text = repair_json(generated_text)`

added article.py 2024-01-06 08:17:05 +01:00			`response_data = json.loads(generated_text)`
			`title = a_one`
			`text = response_data["content"]`
			`vector = embeddings.embed_query(generated_text)`

			`insert_data(title, text, link, vector, similar_d)`
			`print(f"Inserting combined: {title}")`

			`except Exception as e:`
			`print(f"Error: {e}")`
organizing code 2024-01-07 03:41:32 +01:00			`print(a_one)`
added article.py 2024-01-06 08:17:05 +01:00			`continue`
organizing code 2024-01-07 03:41:32 +01:00			`else:`
			`print("Done!.")`
added article.py 2024-01-06 08:17:05 +01:00			`else:`
			`print("No similar articles found.")`
			`if __name__=="__main__":`
			`processing_similar()`