From d4e99c7c5f2227f9e4e6408f044f561e9ed51701 Mon Sep 17 00:00:00 2001 From: Amir Sabani Date: Sat, 6 Jan 2024 08:17:05 +0100 Subject: [PATCH] added article.py --- pyth/__pycache__/articles.cpython-310.pyc | Bin 0 -> 5925 bytes .../scrapingsingle.cpython-310.pyc | Bin 3031 -> 6614 bytes pyth/__pycache__/vectData.cpython-310.pyc | Bin 5433 -> 6045 bytes pyth/articles.py | 231 ++++++++++++++++++ pyth/scrapingsingle.py | 46 +++- pyth/templates/index.html | 1 - pyth/vectData.py | 62 ++++- pyth/web-server.py | 7 +- 8 files changed, 329 insertions(+), 18 deletions(-) create mode 100644 pyth/__pycache__/articles.cpython-310.pyc create mode 100644 pyth/articles.py diff --git a/pyth/__pycache__/articles.cpython-310.pyc b/pyth/__pycache__/articles.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40e56ebde6bae660b85d03f5e389ea869ab4e067 GIT binary patch literal 5925 zcmb7I&5s<%b?@r0>6x9G{r=+eqh(v7Mp-S*W$f6LESnKm5*=$vaFx&=h?q2|YG-@5 zr+Zl4v#TBS2t-!7Brz<*mjJSWrGXrjzaXaoxdsT3bDaX@U=RUvF_44e$oakM*$;lK zk<6g0U%h_ys_NByRliqBtyVGcE4}fz;$OaD82?U{(?185AK*zpWQM^F&LX2(yjjyk z-HfcR-Lx6;tjOuQO;^(_Dm6>s*^$>RH_P2hv!eMCw9W%`t=qN?A-MQvmcfL74;x9B8gnM8xLzE5|x%Zeg&x!JZ(OeQ$ zl;<_K4DJO@RX|ZXg-=-X zWxmMIf%A%ZMfW?6elCm4^6$mvo5uZ1%~wT(FNs&Vv1oi|d~O~vjIF`XgERXS=K?qv zPdE?%hO-S=b;D@*Psr};4W=gBNgBqYm4@9g3S^k=gEI>#r7^gyJ zMP;fdMGqavomL>TupNn1jcY|oU$#XdR40ZM0pHhkjA<_JcUxJqBjU8RDU)t1%}9ru zPoog6GVxK?3bNMf&PYhrZ{Jy4zj3RDIkf&{?I)^q^N(Ba-QM^>6L)UkEr>teSi7sn z@afKtjg5P^@7~m(*KfSP);7sxqaS&yjpuDV>2(yjkr~_U(9Dg;Y{?j!IUCiPu3MRv zn;n*09d=-Hwun_&G=_F=V=c^e33oRdwld?Ma-%Q~qK>lTM0O?h*>IG(dlk>Bs_kBF z5kH8+f#9gFo~~9-*6<5MUO>|?zjI|P>53~s8axQ%@XAe*?qo^tO1dj{!)=jf;p^QX z&4j$t+t0SH9NB-Rx38AkLDcR?K_*(ydB`GEryhz4Ho@jho5@S4v}qPcKN|S4f!;$U zBNJira)C2$eu5gexc!&rqH$<)=P|o!9IzqFtV278$uv{jWiqcF|d zNw@db*HHfrO=HFwyaoqNMN1qx`^ec=-i#8MDSZpXQL`sxg(ttm?@P8kxN$4q#bR+k z+wTc~Gm-vNI#&Gm`)TGs5WWbrEg^k4u@9lRw-WO}%3yz`QBsu<5t?Q}+!o4$gmThe z6lQXkM(k)YRT{~w%AW;MUldVls@hZCL8^}_$n@E1&%JnlJ~~LNC@SnctFS7wnPKXry!l*7_vW5{xL=yCHgwk%L(a1bhomB-j# z$sgbsruGDjJEv+kcTd%v+{*3T*|ZQmrS&fyWp5>ErtDrKQTDO5D6bc!>0Tnaa`}Te z=!(I+8*4vWd;0@F3$sWp`-rlg<&(wm-?@AHeScT9vqYW*hkx(AwYzJ+-go@(`5(x> zc;igx`^{^-vx^6U8UM|dQTMq_Y=c04`Uv_*qW zPrNUEk>8y;E=f8uhL=e-Z3^-`$CR#;% z+ZEFua%L~zg^H8}7mUUj7AoCuC%w+9Dz%e1#sbJye3RFx_(LkbMa8#KD7W30X(Clg zd?ecaOek+tWbLgWij>oil2qUbvPp~l9*Nuf0CEc(Dn}bpRgMGVxgc2jJr;ir3R5za z&CKJ!``=!v!aV%zW}Vep#gy+rHXUu zn!^g@#)j2lEg#FPn`Z9qGTG0)!)j620jR%p*s%VbhF76s92zEulf$XJl8^7eWAll8 za-=EAYI!Z6IxvUR`E+jX*!W7iwS?APKdhhB9=@BGQLodNhp*)2dxi-NXXAfhG2_|V zhH=$M|7s7i^#2K-drwJo_#z}edsbot`e4QJ{eNV}aE2L!@oa*d`OFbU>OhYReH>pu zo&gfl?>)9>G^}-b70dBuNpGCj{l=tx19kZ(6=yl5b~C-?*p@}6$^zS~kT^1xH3)mk z3OLs%qjG~D5ENIARFz=Eh*BiB;xW0^-Z2E$xGx- zD%MaaAf}`((p2yh#I(?nNxuiq2m_paVWBEw;A#;AS5L(NSh}5J+asaXLTL4r(5gI7 zy)yC4hNUiThB1bKM=N=*YZ`m{0~96>j?-Z?;Z>On*l^LTQrO@CUD_V|FA0qIWu8sKUdsrXwi5}1E zI_9#x{+Yp@d>lTExR1@eK6ElCuR`Yp-Zi`@DbD7N;S_gsPfM2}Z4ak27qRIbn9RsZ zyfmVSS3c$yl!Zv9xlC~nk^B#N6N*U)-@k zy>wzzSauP!bYR)}6Rx)O__6%AM_)+t%r(GTEMVN{inYkkJ-Z(0X%6k(|HrD(O3fXw zR6#wpQkcunPpv{ZU(j=T@rqUX#3|WHlEu1}hO>k3X65b5_Skln@=!j*XMbt(xqO!1 z9h1-h)EqA5OWQTF`*wYMe0V;aIGoJSKW4)VxrZF)0$+foFJ@DR)0%rJn?VF!#EEn% zpFVKleHLHLZH!H4s`reC9`@JbxSy=BGLhO_%-6FhZuJ1($v7dVV+95q>Q8QnZPYDrd`p zidkRAevtY<+_=5&Cvmjz?*ZETK=7SN_?JhLm!VBWd@!lI4={vPtvK}ue|J0tA1MKH zhA2w*(0cS82S?%u=y6M=;qIt^DY^;hAl~=mBz|3wNeW8dka;Bjg9Ia>38s47I6ihm zF0^{2+W}3Muo4YB_`eN&<%y;3dhOH)={JM9S;p7z{ch zl^qOcFmp@aE0D2{R`I$oZ{gFIr4I=AU-t*zjX2#C64bf%1k0jtJ^Z807v}EAe5Em` zj~r!1B9`lDFYi!6Nsg>hLCLQq04M7xfJ~zhqXYWf3NkfD`C}xqFo{!qLizwiP-<@_ zxR^*)X*6a%WMyxsNsKcW_bq90x{qmu+IrX*a$lB7dwK1nw&;<%#!KqbSqD9?N78Tm;rWg=(&`M%_4_et?qDq8?sjdm)IOIRT3-nr* zKc=bQr=FZmT*9PCN%hMuZe!`O2;%F)Kos6nde%2b<03&D-;1Q~)*$S59oIa2XV9n8gc~HO#yRTaWIFY#O3+i)2o=Xrvd83!UX30Vp`i5omb;iS(aQ zxIiWqV3WG(A>jr705UNj5y+={6(|L7;1*5eyKNH3V-sjqK?*=P1_=Q;bv6ZF(W+>7 z(vn(3n$78UTFa=1Intu_6uK&GM(bKIO}z0yqxqHlRo$gg(,DLj;b)M7 zu>YsJD}O<=-bOKSe}v^G=ikVJte+w%+F&;<`7Vjj7AfhDrtz9ay~+WO?d=!$3k&HE zwg?%obOkQh+UZI!2<2XgMWh@K>9xKjlzKZb6Zm@qqze&XGDA|{sAi%d?jSJ2c;!^` zMVAK3qeY_qpa?A;RpjzTXc54Zbp4>bEX2JP?)p?cZOfpCg)27HA{r?Mr6fTYZiG8a zIZm923PQ|hBY3*ED$-TT+;q@uvnfX>{fY%e7v-AFD(DL;eJ>D%Mo5I@Yk{RVvfo%kI#nOBFVS{|vK#YqS3ak$(M) literal 0 HcmV?d00001 diff --git a/pyth/__pycache__/scrapingsingle.cpython-310.pyc b/pyth/__pycache__/scrapingsingle.cpython-310.pyc index 38af9dbd593bcd53feec35a1f82ca687699f9f61..34597ac35e738ff6dec6066b9f60a885ea55899e 100644 GIT binary patch literal 6614 zcmcIoO>7%UcJ6LAn@x(O{{Q^j{u$d`%akoyqp`>98UM)tmM0_6ddwY%X|ak@OHDRa z-I6V$nK+P>%>wbpyFijnW+##N<{W}K<(4@F_F@y{kmT6c90H#P$R-G21e1KPnw0-$ zfm}kOU%h%&_4?JTSMR62TrQ*F_g@!&%71Y{QT~-Wul~Bw`4Er)51OJdg{iJmQ%kB! zxaMjl4Q;|r)bx@rX}X)N86~5ZDy7a3gfJV=&yvtHKsKq*WwzRPwq3-6nv_klJ|GuWO5!rznQd*%3h?eP=* zMCk;d#=Jdz8Zu1tlWZT~%}=oGhW5Ggg|>lRaEqV7*a^(pNf_bH%Y1a{{iRb&-(^|0 zpS{Bl{7hp9*&#mtxxxzk1mJhRND_5HV}~DVrPKWM(g#c#RhB+nI#W8!&*JIdJjc&M zvh(~r8yi(hGiZ;Xo#5LxFUXOLJjZAF1onQB71{X1dRcYq1mt*`&XBW4W+px~kmG56!`7&@;}AsXCQ-JVzwkb<3_(U-qMP z#pQP02kWT6<}s(b-dQZCLxe8ZvssyW0k5w``5Pb2%$~VihKS{>Gq($gX7^$cH2jm} zZ|wOx6ag=@Em4M6^|8HE3KXUQJ#CxAwXICAUjbFQd`}p*y1?x$*4ZxZIkvtG003i(ljx8o`mt;?$8~Rm^*l;Rk|LWC#vIy#n{N^-BQL@tnisj{pS9 zrW)d6w6s7AwarARJcen*4Aj|z9+`o&9C*vT9`*5h1sbm}l&hX7L;2i|5-tV|Ilv1E z(F=A_G9EHJ)1N3}6!aIT#}{ccwr|hdb!YrM_rbU^?yvDR2O18XBQ@I(xEOD&2aDr= zMc56pAUrNFHrAt|`f9B#Me)m3=%pNQavSY~Hk48Gs@M*sq5^b&DIm0Ex!p@ZnA%dA zhRt@iod^?~8g06z2Z>PMP(y856~m#j040PvQ$p?cv{9v%#5_GrVxAEiP)}l3^nl;< zf5IyYNzo5n3=kM1FbEJCj6-D{mp~8ry&%eUbW*O{H6CSZ_PzFw1f4`=#M_HmS-E?m5H`O4L6*Jp2h z^cx?~-TdTxw?4gHvga#|R~HtYrDeBP_Zln04_4Ri-dk@z{lljZpZ@;SADn;o6?^v8 z(??H#{A}~t<7a>S?5m^4CMJ(hy*vHh2`hSAKoV{ej}-8Cl~B3UMZ)}nvZjbksMb}c zd=JrYMPnqO7OB>()Duvxcr?;0i?(d0mx2!>-_^mJ&=| z*2IT_4m`=U4ccpB)4;BAjTh*$>d?wQx~fYC(^--k54Bb*Oi5jtK`K;2lch!#tW1aL zP+e2SsW7=|hH26ebOox^)jKQO!sHxMI%v$Cef~?x@}g@osJX=k?1zgeWsAVUVMnTc zlZ;?d@Txe4X@yK=3cj)m-{waN#IY!~$Zf`jj~gA>!Kz;-?;z;*#WrJBojRjvC&<_% z!)_qNGBFV|OCa_}-L@;hWF6*<7{M96!f$v8v61d0NJnN{96!p?(ZEZ3f{T;HFDZ{q zOc5RruG#J?fAhFx6mD~gkL^5G$ZMBX;Yjf4IKI4U!sD9gWmN6H_TPF^{||8$GhZ`W z+8k|09wfQPMY_2L-CUuC>QA&4WE?LgC}`p!CPlgv@Y+jqd~4FDV1bCujnj89$0ybU zinvbm&2qVh*j+A*6X?Q@wZ#cM$M8%Fic=A7g@;Bk&18 z`yJ9U-q3_()u4&wgq!)B6rg=;)wdV;NekhB9f7p=;_!K-T$Ww8-E~;#n03WNGGLQz zATn!~-}NorbWb4aKG7mwc(CGB#j7KC3LLn3Go19J6jhVZi>lXno9U-Wtct*R8r&KT;JoMSuAK>KE0j{;b z=(#M?W^cS0lnF2PYLPj*oRo3Ws$kL9Z$B_95T)V&Rf0SiLtdpSfeNp!IBn1UNWJq5 zcr*oN&c9Ip#gJ-jsI7#=6B5_M1W}nTlMsCiH`;W}f@h2Agqv+TB`xVTo$!qGbBeeT zs$qImL8g}qlT{fJ>hO$&FW?4*=~GIGfQnec(96h~W@v29l(?Lk!Ax~ie@qeRm!D~^ zu2wckhS?33CE=a_J=&smUSGF)eI3TU(3i8nl^3K>L8fhJO|;v? zZQ=G|rL`m6L3_X%?+#Pgfl0;KmgG4pc``JX6PU3hOiKQS+$BjVWo3C<3Hh)V@+Bc( zXT{XFuW0^GU@oPXGD}@l*i%uD(zHJup%%sH0OH}!aOaXdqov$ZzO_qQ;g4V?8CiC_ zffA;r#dgD@1~)XsMP;)mlokVP9-^`O0bs*nnsy}TDr_XoZ1uKQ2Hmj4U&liSk5Re0 zce5|tg&4j&Ot-qi-E14|)(ZMJ2PAb*Fc@})RyZ2&VcXdbSqA?N+u7P140Y(?-gX(> zr7OWO%HV%imDWBggYk}C;l7S!vJA$^Z|_^_VzQ+&m~h1T-LuV}a};06p|#6ZK~=?b z3CN#6$BkCR4uEFv5*OTZ{Mc7D7skjb%sb?jq<@^W5SRtB#xXb@7rs_bj9ppHMztsX zVB0Z`!4Xo;M!qceWOaAF?Z;mCT{5gSd*#b*2rwzx`c^Y3JzeuJw&ZxjLz7GO;k)2z zZHT(aeFsii3L(D5LBHad=RLU7csT`-|BD~~bw~cB!Ya9Z+(`Dx+Gpkaw^t^Ny4Ct zRigCFy$WwoG%M_nx;r0gItgxeK_J6K>MuX#Xuch#s-%~45ShNz3ot`woz->=ajP<8 zLQ)d9DfB5fh*BWP?1=&{O5at)Q9nwOc^!*WRW#I_h=;< z#PxI8sp2z*;8Ct~A7cr!l-QIzN|V`esv5LeAaA3K53zk(6=hzkKFU*pC@4#b8Ra^u zw^T@+!W(PIJ3JAkJIZ5``c{INCDxq}SAU7^`(FZ>sL*vSlR%yR#^2Ni)VvnM9<4J< z6%?i4rrLuii>h%*HMEgL+;&E3b_O#HtuGO`8fN3cYV8~2n|#W^SAjT<$0rn|^_0-l z-_$Z15`BC<7(m;j>i2svnpMsF+Be!4`u~WZLFj@iFH_-Kl$iIA$F=9YI-dqnP?q1aE*11S zAnFRd!0vp7%vX@^LNpixxlLL12r81IBqO#_26wvAu8@t@2JQ=wx=`G0aWL6@dpHda zZ4^ZeeHTIzLq;j6#1}IJ=!1f|K;R4j?h4%-UyM;FBRjD{$_s*zq;yXtMNs`AWTmh| zm>}>c1bhMk0ZMhnpAxuB;HLy+4!lm-pAq-G90331{$35lIA;v^0s38ZxmsA{09T8W}opl(Gi$MfW@S?@Zt z8$PT`MhcgzLIUlPA{lO+`xo@yTw8VihN_nyka|FUvknFI!mQ@^=DqpNn|UAF-?o0w zW!!YyAUJ*~{RlTwPcqZ=izln)9wnS`X2;Dq%9yRU6J{c`X|~Z$n#ph-w^Qx3nQmvy z%n5(U974Ws54T6m5lUd>h##ixH_{4}3|m8XQy!Wv@XSxi}* zj29mUdIIGF15BL1wz{tg9q|SNp$hcfgV++cdZ8^6+8C60W z3Tt=j_946^q8NN0+Z`Qgf?ul(-)h)U3(1o~GFhVY!TIEcS`mE+1tlig`i6{JKD0x_ z0Or&kiK+*F!)f$m+>*2q94D`DEhu8l zw|pD^@7bu@mN#%+LDt{O^#l={!Sz((+8QdNtl%s{i3#OMWEK4PY@r?~HW4nW#TG(0 z-R$_~t@ZO||4{6?QuFS67*sd-CACIN!C$FYJq$M1b~)HmZ-eL6o8So*n%Gh2i!1LN z-T2okno6NtEWdAbjZa+12gff|3eDqR)!6>?a*4`>zc0YscWvGsyCQ@u3YJq)S+>Eq zT&GefU6heMZ(FFOh~c1;n-BJKQ;RWF<*Z$InvH!-{ra7-Z9Hrp@YXxcff{_D%U_kz zmghQB<1XHRMIWvZhHc@?qygWvM8lhHGU5u@O=wbXQ29_vB;>+aQ z;PDv^0d*gw@zyBgjeXbZHNcbcld^>clbI$sAdt;ZF0kD?_hcFq(s3ORYP{~(!vf1# zM_3qeG7PwOa0ueQ)CMV&i7WRS&{4UU^xaMkn@#nZ)T?c`#}93|L{as+2q~h|6n+C7 z4yDEnrZJ7`Os85zNo`nX28~jKvIiO5)tNezVOg5O$&km8HBQ-U?R6%u_I1=V=uJ%{ VlEroO!7ySfLG=ghb@Y!Y`xiO0xPAZt diff --git a/pyth/__pycache__/vectData.cpython-310.pyc b/pyth/__pycache__/vectData.cpython-310.pyc index 4104298addadbf5270d81b2b0903869d4f76c93f..ad5b7d5761068ede0ccc9fe7203405c2138f4c53 100644 GIT binary patch delta 834 zcmYjP%}*0S6z|(^yR%EX?b2GL)LOn-sFD~CAZX;HLJ+CY8VI1IVON3yOQt0eg%Cdy zPiXKZ#tTMna>Mjwj0gP-h(P|=FZCBgD2LKKWQV#P}p$k@A zkj4-00X3DpI^3P8H#gB|G-IE#qKV|)v~t6^t+dIO^M-;RR<6o}+)KxI9^+}2R^l;k zXRJlM&l)_jcx}#X*J3o%_?kU}5o}W40HHIsZb>b z3}oQl!Yl1vplTspbq8uJ=Y|B1C3vZBIF_AcwG9XDokabR!|z^vTM~91FnFnkaKA(e z7k9C9vD#d_2|IF{FWCGn5$2t^s|SMmw4I+U#!On7d}f(Ed-z=qX`Uk@y@E&;?;+1o zA)XK$*l}IV%;x?}7YvJRx$FX;vWw|fMWH?c{b*<;r=ut*p=G5_4gJO$lKxfeseC~p zdm#c@lHh2%|1d?C{1UY)DgO5HOB7)`4;$W!x3$Qj*mbQ;;RB-1WuYosI?wI-44=6* zJ(=P8nWSCbSNuOE8F0af!2~`t)*)GFDXoJ@ym&G9#mU4rIn4hM>N@xg;EIJ(H!-|3F! zAl`X;iel$_(`w`z$pb*y#=7D@8gAip5F2zES`n)?Z?Uqt;L)@2nXa+@HKSFhwLqHw zho=iWp`Hq5FL)H`DXifmL+ZA|If1Hg^78HhRm2Unq0ydy*=WmLx4p fs!FO!s*{^8!lQb6HW3pk5fe5eCZZxKY{R(&ft+L_ diff --git a/pyth/articles.py b/pyth/articles.py new file mode 100644 index 0000000..346a917 --- /dev/null +++ b/pyth/articles.py @@ -0,0 +1,231 @@ +import psycopg2 +import numpy as np +from sklearn.metrics.pairwise import cosine_similarity +from dotenv import load_dotenv +import os +from openai import OpenAI , APIError +from langchain.embeddings import OpenAIEmbeddings +from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, get_source_data, get_ready_data +import tiktoken +from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens +import json + +load_dotenv() + +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +client = OpenAI() +embeddings = OpenAIEmbeddings() + +print(f"Checking for similar!") + +host = os.getenv("DB_HOST") +port = os.getenv("DB_PORT") +user = os.getenv("DB_USER") +password = os.getenv("DB_PASSWORD") +dbname = os.getenv("DB_NAME") + +def calculate_cosine_similarity(v1, v2): + v1_normalized = v1 / np.linalg.norm(v1) + v2_normalized = v2 / np.linalg.norm(v2) + + similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0] + return similarity + +def parse_embedding_string(embedding_str): + if isinstance(embedding_str, str): + numbers = [float(num) for num in embedding_str[1:-1].split(',')] + return np.array(numbers) + elif isinstance(embedding_str, np.ndarray): + return embedding_str + else: + raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.") + + +def get_titles_links_embeddings(): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;') + data = cursor.fetchall() + cursor.close() + + titles = [row[0] for row in data] + links = [row[1] for row in data] + embeddings = [parse_embedding_string(row[2]) for row in data] + + return titles, links, embeddings + +def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95): + try: + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + + with conn, conn.cursor() as cursor: + titles, links, embeddings = get_titles_links_embeddings() + + processed_articles = set() + grouped_similar_articles = [] + + for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)): + if (title1, link1) not in processed_articles: + processed_articles.add((title1, link1)) + group = [(title1, link1)] + + for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)): + if i != j and (title2, link2) not in processed_articles: + similarity = calculate_cosine_similarity(embedding1, embedding2) + + if similarity > threshold: + processed_articles.add((title2, link2)) + group.append((title2, link2)) + + grouped_similar_articles.append(group) + + return grouped_similar_articles + + except psycopg2.Error as e: + print(f"Error: {e}") + return [] + +def processing_similar(): + grouped_similar_articles_result = find_and_group_similar_articles() + + if grouped_similar_articles_result: + + for group in grouped_similar_articles_result: + articles = [] + + if len(group) > 1: + for article_tuple in group: + if len(article_tuple) >= 2: + title, link = article_tuple[:2] + article = [title, link] + articles.append(article) + l = len(articles) + if l == 2: + print("2") + a_one = articles[0][0] + a_two = articles[1][0] + + get_one = get_specific_data(a_one) + get_two = get_specific_data(a_two) + + text1 = get_one[0][1] + text2 = get_two[0][1] + link1 = get_one[0][2] + link2 = get_two[0][2] + if link1 != link2: + link = f"{link1}, {link2}" + else: + link = link1 + + ftoks = num_tokens_from_string(text1) + stoks = num_tokens_from_string(text2) + tokens = ftoks + stoks + + similar_d = f"C: {a_one}, {a_two}" + + modify_similar_data(similar_d, a_one) + preparing_articles(False, a_one) + + modify_similar_data(similar_d, a_two) + preparing_articles(False, a_two) + + print(tokens) + if tokens > 2000: + combined_text = f"{text1} {text2}" + combined_text = slice_text_at_2k_tokens(combined_text) + user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field" + else: + user_message = rf"Here are 2 texts {text1} {text2}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." + + if l == 3: + print("3") + a_one = articles[0][0] + a_two = articles[1][0] + a_three = articles[2][0] + + get_one = get_specific_data(a_one) + get_two = get_specific_data(a_two) + get_three = get_specific_data(a_three) + + text1 = get_one[0][1] + text2 = get_two[0][1] + text3 = get_three[0][1] + link1 = get_one[0][2] + link2 = get_two[0][2] + link3 = get_three[0][2] + if link1 != link2: + if link2 != link3: + link = f"{link1}, {link2}, {link3}" + else: + link = f"{link1}, {link2}" + else: + if link2 != link3: + link = f"{link1}, {link3}" + else: + link = link1 + ftoks = num_tokens_from_string(text1) + stoks = num_tokens_from_string(text2) + ttoks = num_tokens_from_string(text3) + tokens = ftoks + stoks + ttoks + + similar_d = f"C: {a_one}, {a_two}, {a_three}" + modify_similar_data(similar_d, a_one) + preparing_articles(False, a_one) + + modify_similar_data(similar_d, a_two) + preparing_articles(False, a_two) + + modify_similar_data(similar_d, a_three) + preparing_articles(False, a_three) + + print(tokens) + if tokens > 2000: + combined_text = f"{text1} {text2} {text3}" + combined_text = slice_text_at_2k_tokens(combined_text) + user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field" + else: + user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." + try: + completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "Data analytic, Journalist and News reporter"}, + {"role": "user", "content": user_message} + ]) + generated_text = completion.choices[0].message.content + + response_data = json.loads(generated_text) + title = a_one + text = response_data["content"] + vector = embeddings.embed_query(generated_text) + + insert_data(title, text, link, vector, similar_d) + print(f"Inserting combined: {title}") + + except Exception as e: + print(f"Error: {e}") + print(f"Title: {a_one}") + print(f"Answer: {generated_text}") + continue + else: + print("No similar articles found.") +if __name__=="__main__": + processing_similar() +ready = get_ready_data() +if ready: + for a in ready: + print(f"Title: {a[0]}") + print(f"Link: {a[2]}") + print(f"Status: {a[3]}") \ No newline at end of file diff --git a/pyth/scrapingsingle.py b/pyth/scrapingsingle.py index 8e65beb..44ff2eb 100644 --- a/pyth/scrapingsingle.py +++ b/pyth/scrapingsingle.py @@ -4,7 +4,7 @@ from urllib.parse import urljoin from openai import OpenAI , APIError import os from langchain.embeddings import OpenAIEmbeddings -from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data, delete_specific,get_all_links,cleansing ,modify_similar_data) +from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data,get_all_links,cleansing ,modify_similar_data) import json from dotenv import load_dotenv import tiktoken @@ -48,6 +48,19 @@ def replace_with_spaces(text): cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text) return cleaned_text + +def fix_links(links_set): + modified_links = set() + + for link in links_set: + if "www" in link: + modified_link = link.replace("www.", "") + modified_links.add(modified_link) + else: + modified_links.add(link) + + return modified_links + total_links = set() collected_news = set() @@ -78,13 +91,13 @@ for dlink in dlinks: total_links.update(temp_links) final_links = {item for item in total_links if item} -i = 0 db_links = set(get_all_links()) new_links = final_links - db_links final_links = new_links +final_links = set(final_links) - +final_links = fix_links(final_links) if __name__ == '__main__': @@ -142,6 +155,7 @@ if __name__ == '__main__': print(f"Error in completion: {e}") continue + def comb_similar(): print("Checking similar") @@ -185,12 +199,17 @@ def comb_similar(): combined_text = f"{text1}{text2}{text3}" combined_text = slice_text_at_2k_tokens(combined_text) user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with 'content' field" - link = f"{link1} {link2} {link3}" + if link1 != link2 and link1 != link3 and link2 != link3: + link = f"{link1} {link2} {link3}" + else: + link = link1 else: user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." - link = f"{link1} {link2} {link3}" - + if link1 != link2 and link1 != link3 and link2 != link3: + link = f"{link1} {link2} {link3}" + else: + link = link1 else: ftcheck = num_tokens_from_string(f_text) stcheck = num_tokens_from_string(s_text) @@ -198,12 +217,17 @@ def comb_similar(): if fscomb <2000: combined_text = f"{f_text}{s_text}" user_message = rf"Here is text {combined_text}, combined from 2 sources, filter text, and make news content, return as JSON only with 'content' field" - link = f"{link_f} {link_s}" + if link_f != link_s: + link = f"{link_f} {link_s}" + else: + link = link_f else: user_message = rf"Here are 2 texts {f_text} and {s_text}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." - link = f"{link_f} {link_s}" - + if link_f != link_s: + link = f"{link_f} {link_s}" + else: + link = link_f try: completion = client.chat.completions.create( model="gpt-3.5-turbo", @@ -213,7 +237,6 @@ def comb_similar(): ] ) generated_text = completion.choices[0].message.content - generated_text = generated_text if similar_article: if f_title == s_title: @@ -222,6 +245,7 @@ def comb_similar(): similar_article.remove(sa) print("Modified") else: + print(f"First: {f_title}") print(f"Second: {s_title}") modify_similar_data(first_t,"SOURCE") modify_similar_data(second_t,"SOURCE") @@ -243,5 +267,3 @@ def comb_similar(): except Exception as e: print(f"Error in completion: {e}") continue - -comb_similar() \ No newline at end of file diff --git a/pyth/templates/index.html b/pyth/templates/index.html index 9b156d8..c9e51c1 100644 --- a/pyth/templates/index.html +++ b/pyth/templates/index.html @@ -18,6 +18,5 @@ Second - \ No newline at end of file diff --git a/pyth/vectData.py b/pyth/vectData.py index e99883a..35a642c 100644 --- a/pyth/vectData.py +++ b/pyth/vectData.py @@ -83,6 +83,7 @@ def get_similar(): return similar_data + def insert_data(title, text, link, embedding, similar_d): conn = psycopg2.connect( host=host, @@ -97,9 +98,9 @@ def insert_data(title, text, link, embedding, similar_d): cursor = conn.cursor() cursor.execute(''' - INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time) - VALUES (%s, %s, %s, %s, %s ,%s); - ''', (title, text, link, embedding , similar_d, c_time)) + INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready) + VALUES (%s, %s, %s, %s, %s ,%s ,%s); + ''', (title, text, link, embedding , similar_d, c_time, True)) conn.commit() @@ -121,6 +122,39 @@ def get_data(): cursor.close() return data +def get_ready_data(): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''' + + cursor.execute(query, ('True',)) + data = cursor.fetchall() + cursor.close() + return data + +def get_source_data(): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''' + + cursor.execute(query, ('False',)) + data = cursor.fetchall() + cursor.close() + return data + + def modify_similar_data(new_value ,title): conn = psycopg2.connect( @@ -138,6 +172,24 @@ def modify_similar_data(new_value ,title): conn.commit() + +def preparing_articles(new_value ,title): + + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + + query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s ''' + + cursor.execute(query, (new_value, title)) + + conn.commit() + def get_specific_data(title): conn = psycopg2.connect( host=host, @@ -244,7 +296,9 @@ def create_db(conn): link VARCHAR, embedding vector(1536), similar_d VARCHAR, - time TIMESTAMP DEFAULT CURRENT_TIMESTAMP + time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + ready BOOLEAN + ); ''') diff --git a/pyth/web-server.py b/pyth/web-server.py index ae78c2b..ed1dc44 100644 --- a/pyth/web-server.py +++ b/pyth/web-server.py @@ -1,5 +1,5 @@ from flask import Flask , render_template , jsonify -from vectData import get_data +from vectData import get_ready_data from flask_cors import CORS @@ -21,4 +21,9 @@ def articleone(): def articletwo(): return render_template("two.html") +@app.route('/data/get/news', methods=['GET']) +def takenews(): + data = get_ready_data() + return jsonify(data) + app.run(debug=True) \ No newline at end of file