From b7a0e5478c4999da1aa5f6966f09cf4159c1e51b Mon Sep 17 00:00:00 2001 From: Amir Sabani Date: Sun, 7 Jan 2024 03:41:32 +0100 Subject: [PATCH] organizing code --- .../scrapingsingle.cpython-310.pyc | Bin 6614 -> 4361 bytes pyth/__pycache__/vectData.cpython-310.pyc | Bin 6045 -> 6449 bytes pyth/articles.py | 173 +++++++++--------- pyth/scrapingsingle.py | 16 +- pyth/vectData.py | 168 +++-------------- 5 files changed, 122 insertions(+), 235 deletions(-) diff --git a/pyth/__pycache__/scrapingsingle.cpython-310.pyc b/pyth/__pycache__/scrapingsingle.cpython-310.pyc index 34597ac35e738ff6dec6066b9f60a885ea55899e..2598ed55e69e99b6c90bdb13b4eeebcdb0e7a35c 100644 GIT binary patch delta 1035 zcmYLH-ES0C6rXc;pGBzw>=_&hOs8PJex( z6c>xOq0aZtef+6wzw`z?*&q4^#`pjEvEySLsv4jDEcT!!x=8n(>E_wyIqJ|pdXn~k zN9X`Oh3`Eu=parhtUS!=Xr0iv?~&RJ&NR>(PEmph()a81difGIzr#Mm)iqAMMrCw(821Jb^L`cUE(Tk9}32r51y|3I#Mv}&a zl5r)cS-<$bu%bHOboz^-O25#Z-3D6~*PPzjYsz8QHLNJ85H_G~y+M2Cbns)^T~!ck zGb4wX_L6uM09fVKNg$2|KQD=+iKw;f#h+ zk?(%XT-N4-80)@#XHuIp8mbyneipSgt3aAbkc4XAA??_ylRmr5>Br}4Wu@AYtl#hw zS$MUpn#*VCB`EDzLD}Iq;=o5PJKAK{G38N~>KGwY)$M7)V74GVp>FN0&|9cYubbh@iPByU;2PNr(?9?x{! zRqZ$)RWmD@3x`0WmLN+u;D}Q=pukETxWOR;gb;85sgE2$IYk1YMJp6|uV%)q!g69p z^}ScGUcG+x>eZ|I`p!R0W!+3Bslo5#vk%y_z4x<2Pe~y_O3?t`3u)CtUPVeHL|ABNjmj4ZT=$F=d|%7AN@(6h>+(&&iHY3H$)~l zneO{2G|rL22=u~qxyDKkqeYM1@pu75d_0&o_mcy`qUn%5!5>XYjs|ZheoKxAdy)r- z)hx4OJG5A;)p!Jr7^UT9=1@Kv@Z?Np8bz_1?JNm?B=}kKI=L7Or)J1l@Lp=y=L4V`nsp{nOYo~Wy$JA^8{ho--(S1&;^Y$&CcH(?#G+fHp?>M| zCw;S=yWFxJtL)a7Ys|AaGPs)TO zw>5rQ5}7Dyn-Ljr%2-8z3|={L8ApDB;NWphCY36MUQ(%=(%8~eywXg9CfSTW*5Kg% z;)d?0{InO7=?y|-OFI9nOrx_P&-hv3!8zb_eja#O4Dih^zuPlp23FhyGzYX7Xcy2v z*&};pUnS!A%YJGsY5dnR^H7sHXkBnXBL09Emjj?VK!L*PH9&bk`AuaJWVE4-sT{!`Zjc;No_MPRn|aG_v4te7q|hLz%?mdjSkA>R&Q&jmxFZt zw2dYnh>npca0NBJYqML9JSP6AO!z4|O1DAwT3*j)uafTY`eaI4vLJWR?Q}?0*q_j0 zf2Y^qk;|R!0-K6z-T)NX-x1BYU}8wM*j37 zD=*p3f+g&_T`O@;;6OiDfHmql)<-@B(8!!;oLRQ8u+mzMg$XPUw`{1rR)xEDYswPt zDlaoJVO8v!$2cf9?|2hd$)Q%gw8SijtqGXqc+3GYXWlAztdg)U%w4`@xlV1}TC=@H zYpks@26kADsv6qpQeQ8D2UE%cVKrjPa^t&Oj#%Czvnp<_=C0w2z|T{jfGXy>Rtdhl z$b@~n?FDCb7xAUwOt^t>nRKK>d~re&9MqP|J)r|CehA{yPejpAm)sHwaCL z!lwc-*$x2DcDln+tE+YM65jVoZVS zn`>>mRkMYMo6G9U-nzOUco|JmfRns}>;k~}U>IBG5K^8%5x!OU3_uvO;pP#%4+Yy1 z@PY`9a?NH=@T1&tGhSYFZOD!=-oA=jAIT`S)^>!^s&E~+dfgHgVUpEvF-l=4MVL~2 zab=b9b#9{hIL=GV-YK(Xyq(5(hq-nJ7fal;;Z+c(7r48+3{L}E@=DE67o{*>!3Y$+ z&=e{Hz!`Ya7b|TlLZhOd5^$NeXdL>s2j=1+Kt|o5q2aB<#VEoUX33$}a{&nUCR;rq z*qdtgnp0uiJ5wa{F;P#9(74UYp3B2TCw?>}?3ITHJhQ-d zv90kv@IX8TFm;1O^<=~#=~w=y-b=E23-daRE-3|5H;GA-(DKmIWRUa|Lm!N^XlE3S zB|*c`yCW^CgElmnt-my0X5)sAHj?U})w695WO`DMlHf(Ye?)&2{TKfboE#_Wpp1Vx zjNB53IaW3PMlh1~V*EDn!GrARQSPGj4nSzo*30iCb|S*#VFc5CE2sGZR6UAtF!)Dy zhj9_vGr?%?;uvlXKY=idpk8oikvjzdDTC=F_(X6&w_~2;JCLi#+zN6W;adm-!9&1j z8vi!J8o~nv^?qAN?gNDHAT$sjBH)69xANCWckm=Xakw{vQPweT5S#vX^kBunn4<{7-{yOwq{L?}3HAktbB~_)e?r1f=q|375 z7_~?#BFm~1Eyd(JX30bzapJW^DIxo!&QNWrG*nBLlB&WJdkRu>f0)Ids-=`HnJA6O zQXHjGSxTUEN|uID8so~t(>%pT_$WWc$5`^qNa+k4W~tAW(o1ZFjUqqG-eP0y^yg~n z96Q5aLd!Ti%g!O6VB>58`OEBOb{_e8HpyN=KFMBXuOWYhS?mJxSJ~_ABJ$VR8|)Ht zi(O_{kY8X^ERFni_9pul@{26Pt|EVfO|u!~m-r=;im3%9eeD_PG?!L=-F@sJ8xee? z>Uvz1w|K>C2sE9lG~8;Pm)&Zu>e!;{ZKH9>Y1pjH8Xm82r8VDVw#U6{jr(y*8FxI} zk1f7ezOzy&%EG;sd{9^|Wb^(Iy6(*v3LmcI7iD*DemR?t_{L_#^?ak*5J+1t7rxoF zUH3^tFh9!H>UND+H2UPuFM`~_vw+9FiKMM~%0qQuYb#IH^UAK)Ry%c7)^$&BYa41? z-%$57rcOW$6Uwg9HlPbFCjccSZTMQf=|`Pv-F7y7quvlTL2Kaiw5@A+uKUAV*UHpx zJJlV|P`%z;O&n_KKoKwF)o*UhY&L3q#&+#ByI!4HnoU}t zX>NO)Go<=O+q0*e+y42A?NnM0)LTBX4>YxFA{kXj)w8Ohib+%|Bt`fmWpx#F2NjQ0 z1mVRvQb|dWwHW}NhDk$k_ ziMFGQ$|ozkv386B0=uSX>_<@6CtyGOQH+SAW}qkG#oF<vCnAH+sQq!tQTjIw(^z2qHXgllp!S=UgG6HVZ6f10Ah8yJ;n?i#vu)lS_3lkq4mNv@ZHuZl$<0SpXARHX17qw;Jn~ zOLe!=5*2>oc6!J+Dvf&Gk5*d3Z3sWcKjW2_$9;31dzDSwaU^KO6jt|R>#%O{s5D-G z2oS?!W%?99(geIcKT>fTE>92p5hFW)BT4ATyAp$XHwAY$8x9lKY2wrTGgt$e zNq3Cvj~{I;d#$F!{ZzLnSY@nR@4gNkhf~txVuF^5cj&In2H`wXbwjTq`xX)t zC^ms%QyoJt%fR%gdPYqVw$-TYi>lf`QYjNHVi7M@@N|DVO7sxKpTCKu191@guG$8d z5qkkjiiA1b!M;I2YlDP9^gG2q&~ybj9xv91ce44cbvOzLu$FSx)Ro-Im5gKiQoNyF941p$+bZYRlqXY5AmuP)}kB zt$o@c;}5mr-bW?W25F*W@PQ^sH&PQ!V>;MB;Cqc3JvE(0dTNF=YGYl8=AyZs_xm(w z5ezbiWcoq{3U0G)>w8vFwD|0a&f|Hn^JuF~nNR}fai9ivOJ46l6R#rkbC zKIi-k6tqlhj3Nn*G^M3rq6tmp(b(s6$lp-JP6p}TLK6BMPYXTGzQF*$j+Zdn5vHxj zo@($5+I~bDXOuh+d;xTn)^j^QPgtGbQVu~Q*<5ku2%B3|Qq+t^q?j25WaiLpF#@4! z#*#2hcjvx8e|I%ou%<4#8LR)XGM8L>(z9}KF&z_82!?{y&)GJ zGBeX?+mGdDJ@?H3!}M-zz#zvu^$tmg9COK?MVx66)krY~vEra;DH2V-AqA{q$a4y_ zh-{POpvd+75Dg~^C_X|juqr;L9D(g^grP53afDV3!UVZw3$KX>lzfV$?`T6hB)*^x zeolcxM~~q1T^wTI;dC;lb*W4AA)M=&4uCMXdT(*Qm_2GNg>3QABEZE!c;MZn5d{qg z#sInnG1`IQ$Lst_8F4gpDgjPWMgW`v%$%+@Sap3n^kP!fACnw(SRt*BLcfBvdZNgp zj9$X@jTeb>xZOv^5I@2U!lq>4^TwKjBhseGs1w4uU(hf@xZp_oh?_Pd1hhMiN@pWf zYy`p#Yy`rKbyY%Im0YKzH3&DzX+nkveRkQ$RNY9R$BTA2>Ugs~B51(=#=w^N!)}vT zs_WHCxrsCNpbAb&AciBzynhPGx}ioYME?e915fv-qeRaF9+%>qqvlP4S0}uXu#`ti z=%^lTH?B_{nD$Xi0K;XfQGy0*PE+%&*+Diu3Q?&00&ht?Wljg<|0d z!_OiP?2k<(dk1o9CrguHqR}jS_L{^0K;K8P90@~*4hYjgwNDro4P9XfK_L&|v$4U% z0iNz5sFjXhgUhFukOPz;;tM0aOAGldxvT6)#cZyyw2~tM>6Fzcq@BZ}-MFpuN>?98 zh94gIt*P!?7Jj$}TB~lrO;>Hlx|d&Cp3i@5eUSY)(>E%xnEUhjg*)^4fi6Uv*0DX3 z%#Q8p2H>3?o4WR$x8F(k&)f9{$4^dyqg7m5&K8RE%lE9s?Cts0yG3hZHJ{Jsisf!k z|D=J7d~aptZgxJ`m9P`jrlW&kp*`(EjCx$8>c=aX6=cBH{)pinb|a+VNdhfRwBr0i z&N6)&SGewWrP18DF31A>Q_YRwMrOL(PK)~E?jsnKs8828ZuKhebknYiCsphd@uDBW zomq1`xNOrJF0O;+<+Gp|DKKoZX+H|`#htXEJ1~)=WP}nr#E4T!{21NL;DR*}e3IJb zIct$hbh4Ib8yRp53T|ZpD8n-uu*mp8kXw`-E?G~Sx-=0fc@K3;%8)mj<{V<8mcwt6 z;kY>@rjmqZ1#xB)5m8i2XbEVL{)VQh;XnE>(=?-rxH*Y@*i4vln+a literal 6045 zcmb7ITXWmS6~>Juc-6&px#+zeLv4qPvk~Hv~;v4 zL@SNfq-bT(ni8!nTGN*F#WAa3O<0rGlr_zAj}wg)ZFQGocj;Dfxt`sznx5UZ9h2Ms4mz`rYqBPDeXFxwR6?DZzUA9(D@;?#EXOy))Y6US?MkgK z8mpCR)VNnGSHmoZRu^lvk1N$BF}$*PxBN5_YMZX-hg#R=D7QR|hkDoayoWAlVUn$N z%(m51Xvw`_0=0(YCJyf%6oKSRU&#APAU%;!N_$Em_s(T;uKH@AY{-GSAwN=>JPQ;J~7hmyJU-J`wlF5$~dx;>ipL!zgC4=O?&Xgx$30i6Nq=M9v^!e(Z9_UO3Vrf6K zpG8}n1E!JNZHE;xj?(GaXV0g?0uF0DoF3;{X!rGGJ4b?{|Nf? zCs;C&zL8ieNPi=Js}VENr;EPyx2j-jW-pI*Og@47z}sw%X5?sOxc9+q|H^NbgQF}D z>lRjc9&f#SrB=REzF9YX+jp#z)n2n0vpX9lnESqQt6I5hM8qGM=Bl8@cf^hFh>3{>AjK(LZn51HgifI&KvbDuMmxn3qORKfzTcN&g z`K?XUariMBPnevWJ7H=a#DPQwFT$)X7S^O?hlwt?JARmGIj(0F3t>V45^BEnC8S4a zgy|R^sCtuI-lprYP@~21^YoNs)|c=Vc+qA&(+f`=?ri#7UB?Ow@ksQn>G(XpJj};^ zv;_VNJtp1LQEB`7KXoSCoTeNV*Khl3+J0a)EUy@I7gs75OUA`o`C{>DmJ<;C92G}V zNLY|xpee6Y@g|C*#xGM_7&q}?%i=p>E`A(XGm3w9!}9wExP>b{vezi~T8ub<7sEqd ziJTWjcHxz(XpF%%;ivX>;gu47U$mcL@bEo9q6JB&uBV;qi zUuTToZ+QhVh@h)i4yoOki#&rtLMR!;!jcj5vot`z1Q*h_N(Q$~wo{Dn{n6r`d*zxj zcfl(e!-rA2;L%aMDkj&9`4=%$5}<@?$9*_37}W2!+qNIsOy?y`;g_hGr(%JMpHV@^ z#N;L|G`gH3#y~&3h|`a8c%-ILyXk6DnN+p^>WQ2pW*(^s{x&X;Ielq}(*nllIai9> zPY)1}Uqwf#k&_ywn@)72fi|Wb#WhkoX_zQ=3;S3TBN{agSNwe=zjM zL(CEEc&?nI8SjEDSCLm~bq?7PCVY{KVS|MQLO@(0aM;6?XX7=O>*MeN<{XIwU&3f# zA-qf{a+tVfI*4FLE@DEcUIa3+ipRK&-=^Z{C=TUkvERIg(J`1Sfg=2-3cslZD*UEK z+Fn-#?h0wXr{uoZymoJOX|Y~Dtle6vpTz#r9(qnJGc{{tR+If4~cS<7HP6sRZ0l4Rel^okwaJ1KAk&LrFMOnf4VV zK~h+f9?5|7U^0>D1-P?4;EwjTLpG#bmhu`}sW*dS6OaEqxd*#3X;%7;@<4%AWRd*} zOt8;W=6@6gnGyD+uX)Hn-FEl-pXqLN>lta+AmosfHfcrP{58kLX2!cdA5V^qa>)7X zY%RKE)D-!l8~%wNHQo#vc^du?W}xUc^w#voF>}eqjWNUjW$uVSy0iqw!XJ&@J{%HZ z@drAfUS{?JUS|UPZVMa5KArzU011yH4|yD2lM6~t=ARHaIO5+BICXT70Y{3`F+a@| zraqQWNFao4^g zd^n{}oRW1;1|Q^4oLu_BgqJ>F>@*#>vk^_A3}lpgX>8NX4nBlz{1c-Q?S}mLw5}_f zLiz`?zr>s&WL0#;wEKY@2q_AIea(xcw(l%oU#eDCjr!t^J7r_}mQns>xmK?o3XE5a zKX6MhKu8^I3C2dc4C8LM>6>egr6EFiC-9;pnWU~M*mDsqpAjqv;|Mv57#506;|SN^ zR{?hj&i>ZitsiM7?mTyAgfvneTsm;k1PCgm)iu=bibs<#W5`&38?k zKeS;~a*<)8v(@hIM4ws|*Tbht@gbB`TEvChAyS7)ge!a>J$es*j^>L@@-(&RvxC?p zi|v-!!SD~LkMe4fABs#&#~0Kt@};P``$Rbjz`>llVNkoi62M r`qOSHhwmUsH3OAK|B(b$&!qJ8`jmcLpTvJkpT threshold: + processed_articles.add((title2, link2)) + group.append((title2, link2)) - if similarity > threshold: - processed_articles.add((title2, link2)) - group.append((title2, link2)) + grouped_similar_articles.append(group) - grouped_similar_articles.append(group) - - return grouped_similar_articles + return grouped_similar_articles except psycopg2.Error as e: print(f"Error: {e}") @@ -101,7 +49,6 @@ def processing_similar(): grouped_similar_articles_result = find_and_group_similar_articles() if grouped_similar_articles_result: - for group in grouped_similar_articles_result: articles = [] @@ -112,8 +59,8 @@ def processing_similar(): article = [title, link] articles.append(article) l = len(articles) + if l == 2: - print("2") a_one = articles[0][0] a_two = articles[1][0] @@ -141,7 +88,6 @@ def processing_similar(): modify_similar_data(similar_d, a_two) preparing_articles(False, a_two) - print(tokens) if tokens > 2000: combined_text = f"{text1} {text2}" combined_text = slice_text_at_2k_tokens(combined_text) @@ -150,7 +96,6 @@ def processing_similar(): user_message = rf"Here are 2 texts {text1} {text2}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." if l == 3: - print("3") a_one = articles[0][0] a_two = articles[1][0] a_three = articles[2][0] @@ -190,13 +135,82 @@ def processing_similar(): modify_similar_data(similar_d, a_three) preparing_articles(False, a_three) - print(tokens) if tokens > 2000: combined_text = f"{text1} {text2} {text3}" combined_text = slice_text_at_2k_tokens(combined_text) user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field" else: user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." + if l == 4: + print("4") + a_one = articles[0][0] + a_two = articles[1][0] + a_three = articles[2][0] + a_four = articles[3][0] + + get_one = get_specific_data(a_one) + get_two = get_specific_data(a_two) + get_three = get_specific_data(a_three) + get_four = get_specific_data(a_four) + + text1 = get_one[0][1] + text2 = get_two[0][1] + text3 = get_three[0][1] + text4 = get_four[0][1] + link1 = get_one[0][2] + link2 = get_two[0][2] + link3 = get_three[0][2] + link4 = get_four[0][2] + + if link1 != link2: + if link2 != link3: + if link3 != link4: + link = f"{link1}, {link2}, {link3}, {link4}" + else: + link = f"{link1}, {link2}, {link3}" + else: + if link3 != link4: + link = f"{link1}, {link2}, {link4}" + else: + link = f"{link1}, {link2}" + else: + if link2 != link3: + if link3 != link4: + link = f"{link1}, {link3}, {link4}" + else: + link = f"{link1}, {link3}" + else: + if link3 != link4: + link = f"{link1}, {link4}" + else: + link = link1 + + ftoks = num_tokens_from_string(text1) + stoks = num_tokens_from_string(text2) + ttoks = num_tokens_from_string(text3) + frtoks = num_tokens_from_string(text4) + + tokens = ftoks + stoks + ttoks + frtoks + + similar_d = f"C: {a_one}, {a_two}, {a_three}, {a_four}" + modify_similar_data(similar_d, a_one) + preparing_articles(False, a_one) + + modify_similar_data(similar_d, a_two) + preparing_articles(False, a_two) + + modify_similar_data(similar_d, a_three) + preparing_articles(False, a_three) + + modify_similar_data(similar_d, a_four) + preparing_articles(False, a_four) + + if tokens > 2000: + combined_text = f"{text1} {text2} {text3} {text4}" + combined_text = slice_text_at_2k_tokens(combined_text) + user_message = rf"Here is text {combined_text}, combined from 4 sources, filter text, and make news content, return as JSON only with a single 'content' field" + else: + user_message = rf"Here are 4 texts {text1} {text2} {text3} and {text4}, combine the following texts into a cohesive news, remove any non-news related to all texts, and provide the cleaned data as a JSON only with a single 'content' field." try: completion = client.chat.completions.create( model="gpt-3.5-turbo", @@ -216,16 +230,11 @@ def processing_similar(): except Exception as e: print(f"Error: {e}") - print(f"Title: {a_one}") - print(f"Answer: {generated_text}") + print(a_one) continue + else: + print("Done!.") else: print("No similar articles found.") if __name__=="__main__": processing_similar() -ready = get_ready_data() -if ready: - for a in ready: - print(f"Title: {a[0]}") - print(f"Link: {a[2]}") - print(f"Status: {a[3]}") \ No newline at end of file diff --git a/pyth/scrapingsingle.py b/pyth/scrapingsingle.py index ac86b52..e939adb 100644 --- a/pyth/scrapingsingle.py +++ b/pyth/scrapingsingle.py @@ -1,10 +1,10 @@ from bs4 import BeautifulSoup import requests from urllib.parse import urljoin -from openai import OpenAI , APIError +from openai import OpenAI import os from langchain.embeddings import OpenAIEmbeddings -from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data,get_all_links,cleansing ,modify_similar_data) +from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing ) import json from dotenv import load_dotenv import tiktoken @@ -39,7 +39,7 @@ def slice_text_at_2k_tokens(text): sliced_tokens = tokens[:max_tokens] sliced_text = encoding.decode(sliced_tokens) - + return sliced_text @@ -82,7 +82,6 @@ def get_article_links(url, already_checked): return link_store - already_checked = set() for dlink in dlinks: @@ -116,8 +115,6 @@ if __name__ == '__main__': title_text = replace_with_spaces(title_text) - - print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}") text_text = slice_text_at_2k_tokens(text_text) text_text = replace_with_spaces(str(text_text)) @@ -138,13 +135,6 @@ if __name__ == '__main__': title = response_data["title"] text = response_data["content"] - #print("*********************************") - #print(f"Title: {title}") - #print("---------------------------------") - #print(f"Content : {text}") - #print("*********************************") - - vector = embeddings.embed_query(generated_text) if not is_similar_data(title, text, link, vector, threshold=0.98): diff --git a/pyth/vectData.py b/pyth/vectData.py index 35a642c..e3deda7 100644 --- a/pyth/vectData.py +++ b/pyth/vectData.py @@ -7,7 +7,6 @@ import os from dotenv import load_dotenv from datetime import datetime ,timedelta - load_dotenv() host = os.getenv("DB_HOST") @@ -27,20 +26,20 @@ conn = psycopg2.connect( def calculate_cosine_similarity(v1, v2): v1_normalized = v1 / np.linalg.norm(v1) v2_normalized = v2 / np.linalg.norm(v2) - similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0] return similarity -def is_similar_data(title, text, link, embedding, threshold=0.98): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) - cursor = conn.cursor() +def parse_embedding_string(embedding_str): + if isinstance(embedding_str, str): + numbers = [float(num) for num in embedding_str[1:-1].split(',')] + return np.array(numbers) + elif isinstance(embedding_str, np.ndarray): + return embedding_str + else: + raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.") +def is_similar_data(title, text, link, embedding, threshold=0.98): + cursor = conn.cursor() cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;') existing_embeddings = cursor.fetchall() @@ -54,12 +53,12 @@ def is_similar_data(title, text, link, embedding, threshold=0.98): similar_d = existing_title insert_data(title,text,link,embedding,similar_d) print(f"Similar data found: \n #{title} \n #{existing_title}") - print(f"Inserting: #{title} \n") + print(f"Inserting: #{title}") similar_d = "NO" cursor.close() return True else: - print(f"Same source of same article!") + print(f"Same article of same source!") cursor.close() return True @@ -68,13 +67,6 @@ def is_similar_data(title, text, link, embedding, threshold=0.98): return False def get_similar(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')''' cursor.execute(query) @@ -82,73 +74,49 @@ def get_similar(): cursor.close() return similar_data +def get_titles_links_embeddings(): + cursor = conn.cursor() + cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;') + data = cursor.fetchall() + cursor.close() + + titles = [row[0] for row in data] + links = [row[1] for row in data] + embeddings = [parse_embedding_string(row[2]) for row in data] + + return titles, links, embeddings def insert_data(title, text, link, embedding, similar_d): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) c_time = datetime.now() - - cursor = conn.cursor() - cursor.execute(''' INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready) VALUES (%s, %s, %s, %s, %s ,%s ,%s); ''', (title, text, link, embedding , similar_d, c_time, True)) - conn.commit() - cursor.close() def get_data(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) + cursor = conn.cursor() query = '''SELECT title,text,link FROM vectorsvevijesti;''' - cursor.execute(query) data = cursor.fetchall() cursor.close() return data def get_ready_data(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''' - cursor.execute(query, ('True',)) data = cursor.fetchall() cursor.close() return data def get_source_data(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''' - cursor.execute(query, ('False',)) data = cursor.fetchall() cursor.close() @@ -156,138 +124,60 @@ def get_source_data(): def modify_similar_data(new_value ,title): - - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() - query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s ''' - cursor.execute(query, (new_value, title)) - conn.commit() def preparing_articles(new_value ,title): - - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() - query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s ''' - cursor.execute(query, (new_value, title)) - conn.commit() def get_specific_data(title): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() - query = '''SELECT title, text, link, similar_d, embedding FROM vectorsvevijesti WHERE title = %s''' + query = '''SELECT title, text, link, similar_d, embedding, ready FROM vectorsvevijesti WHERE title = %s''' cursor.execute(query, (title,)) - specific_post = cursor.fetchall() cursor.close() return specific_post + def get_all_links(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() query = '''SELECT link FROM vectorsvevijesti''' cursor.execute(query) - db_links = {link[0] for link in cursor.fetchall()} cursor.close() return db_links def delete_specific(title): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) - cursor = conn.cursor() query = '''DELETE FROM vectorsvevijesti WHERE title = %s''' - cursor.execute(query,(title,)) cursor.close() def cleansing(): - - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) - day_long = datetime.now() - timedelta(days=1) - cursor = conn.cursor() - query = '''DELETE FROM vectorsvevijesti WHERE time < %s''' cursor.execute(query,(day_long,)) - conn.commit() cursor.close() def drop_table(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) - cursor = conn.cursor() - query = '''DROP TABLE IF EXISTS vectorsvevijesti;''' cursor.execute(query) - conn.commit() cursor.close() -def create_db(conn): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) +def create_db(): cursor = conn.cursor() - cursor.execute("CREATE EXTENSION IF NOT EXISTS vector") - register_vector(conn) - cursor.execute(''' CREATE TABLE IF NOT EXISTS vectorsvevijesti ( id bigserial PRIMARY KEY, @@ -298,10 +188,8 @@ def create_db(conn): similar_d VARCHAR, time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, ready BOOLEAN - ); ''') - conn.commit() cursor.close() -create_db(conn) +create_db()