From ae1c1902dae5dea7eddbbce8f61457cf68d29472 Mon Sep 17 00:00:00 2001 From: Amir Sabani Date: Tue, 2 Jan 2024 15:00:07 +0100 Subject: [PATCH 1/5] Combine similar article --- pyth/.env | 7 + pyth/.gitlab-ci.yml | 21 ++ .../scrapingsingle.cpython-310.pyc | Bin 0 -> 3031 bytes pyth/__pycache__/vectData.cpython-310.pyc | Bin 0 -> 5433 bytes pyth/requirements.txt | 141 +++++++++++++ pyth/scrapingsingle.py | 186 +++++++++++++++-- pyth/templates/index.html | 23 +++ pyth/templates/one.html | 12 ++ pyth/templates/two.html | 12 ++ .../test_scrapingsingle.cpython-310.pyc | Bin 0 -> 2416 bytes .../__pycache__/test_vectData.cpython-310.pyc | Bin 0 -> 2808 bytes pyth/tests/test_scrapingsingle.py | 60 ++++++ pyth/tests/test_vectData.py | 89 ++++++++ pyth/vectData.py | 190 +++++++++++++++--- pyth/web-server.py | 24 +++ 15 files changed, 726 insertions(+), 39 deletions(-) create mode 100644 pyth/.env create mode 100644 pyth/.gitlab-ci.yml create mode 100644 pyth/__pycache__/scrapingsingle.cpython-310.pyc create mode 100644 pyth/__pycache__/vectData.cpython-310.pyc create mode 100644 pyth/requirements.txt create mode 100644 pyth/templates/index.html create mode 100644 pyth/templates/one.html create mode 100644 pyth/templates/two.html create mode 100644 pyth/tests/__pycache__/test_scrapingsingle.cpython-310.pyc create mode 100644 pyth/tests/__pycache__/test_vectData.cpython-310.pyc create mode 100644 pyth/tests/test_scrapingsingle.py create mode 100644 pyth/tests/test_vectData.py create mode 100644 pyth/web-server.py diff --git a/pyth/.env b/pyth/.env new file mode 100644 index 0000000..c213e8f --- /dev/null +++ b/pyth/.env @@ -0,0 +1,7 @@ +OPENAI_API_KEY = "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" + +DB_HOST =localhost +DB_PORT =5432 +DB_USER =postgres +DB_PASSWORD =salmonela pljusti 221 hamo +DB_NAME =svevijestiweb \ No newline at end of file diff --git a/pyth/.gitlab-ci.yml b/pyth/.gitlab-ci.yml new file mode 100644 index 0000000..8cd8989 --- /dev/null +++ b/pyth/.gitlab-ci.yml @@ -0,0 +1,21 @@ +stages: + - test + +variables: + +before_script: + - pip install -r requirements.txt + +test_file1: + stage: test + script: + - python -m pytest tests/test_scrapingsingle.py + only: + - master + +test_file2: + stage: test + script: + - python -m pytest tests/test_vectData.py + only: + - master diff --git a/pyth/__pycache__/scrapingsingle.cpython-310.pyc b/pyth/__pycache__/scrapingsingle.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38af9dbd593bcd53feec35a1f82ca687699f9f61 GIT binary patch literal 3031 zcmai0TW=f372aJgmn%x7BuBo;*O6^mqAiiOWLb`^xKV7ub$mtT#_<*|vEm$&OU+(1 zGn6fY0u0D6L1Dlt(5Ey2oj3oJKD0le^V+8XeQTeB!swYLWg97qEHSq;XU?2C^PO)P z7Yca;zi&!^fUkxO;~#qP@i&Nr&#{QRmSIqXn#>4I&op)4VwPuNn_-#I_Uu$zxLxdj?L! z;Ww6d7ECzXoTLZP&N(>OokBl8`N2f*rs-I>L=VzKXlYtoLfLt(K@Zn;|3W%{B%Qy| zpPy;}9>$up&2qCs$LUdej2{29MNiO^aQ-`ko`M;y$@f_;ty^^BmE~Q8i_J^a7&n@8 z&3W%K40jjcGFra^SLo?+!&}7m47QVSm7d+h(_uRai{BaFTm8|wemez|A4z+<2fLr9 z)|Q|Mi!h02UZT^q^eX$>6b=;jWqWwnkBzOiZNRnH=FWs^c*~~IF5q70VR_%`W8(>8 z#9RK|@>aXoJ~N-8WGyYF3qNX{l}uIm0{oT?Hd<^gZYB3ol526+jDrZf&VvM^mD|`G z>Gf8_b)Yne>H=l%!&_fLO~$-rslgx;z-5*C(pSTQsEQyAn9oxI>JGBlr&StDh_+Sn z!Nb-2E4QmF4{ulRtUf7a+QSVg6R}jOY_i~adEIXx+7}3(h)q9;HsW$E4%=h<@^#h( z;cu4pOgsNu+T zxvO83FIoYkmAm(zTweUTL{^f7!DCpz6UfTK{9<|j3Yot1>qqzQ&JY%C0=WgX&A3Ev zG?rVtH<1?n?RUJh>OI2MoxXzv1(szqp1;`~K63?qIorMZCmH8eOwvNb8NY zYjvz$#&fiCa-C9%q~jPgqz$t+Q}jej5VX=-v9A0=*v%o z4H>d>;&Y6n%K2OdH3rJ`AK}&64GtT87}ZL7<#O0+fsjIF>L69F0X_vT)S!^QY>8?u z#-Tp9vL!s1%H0SeTJ;&@87*}D1QA75#-|jmWe{p=*k>HA>{M$Fj4x2Vhe%??ld=V3 zQn`J9gvzrZ+N=tM9Pom+m(>EHY~A!{w|&-vw+5fU`Jb*=8v3ey;jjBqP`L?WQ^rX} zY{Pbd_>sYE=nDzFlI+MvMbx;TXmDd;P)>H#C?2xf-@cmebrj7Cg4K|jH+8X1*D9K> zsY@Pz)(h)DcGmtkKZ8m&6RZ7Chebyrg}@kg8b-%>>g<{wlUp70kJc7uXXipgde2*G%i#F%K{|kLvsBR4u!TnX7F=*%B@yI%$I7F7 zJj$Y$_UKMai=x!k`}*p0$^Du{MF!h`B#9;-9{MtfBeH>nKx6~RM$B0J4B>;*NlPw~M>;M`r0vd- zM;I^c{i#fT17W-kxcLr=;%GL~4xrf(x6}5036#hm4PvZ3^GBkM>$Of58mrkWir~0RxA{zlPD$027Y>2tblpBH& zejNnY(Bg|auoLHh>1*fqPdYLCOJ6&GqZeDUMCyP0tN#A^D}>j8M5Ymqn9=sZRnB9M zK_nXF40M#2Na-_`Sr?a6e&HkNm>NO?VZnNzxpZ@S{HR`QU`~|b z{qw~&?A56cQ`_lFmQsr-C-vTtYsOZt9%u~l91gjwWp-1;Qg9)q#Wh{NxBNkPxA$V{npjC~Clt1{m*3G61RUW~d^p%}f zf>#Y{5c&1J1x)?O4*^sE6fWbVy3%~rOPyo7cMvP(q)0&8)XzMc)fi3F8S@x%Ch#V( z=}Byi96I{p8U&sV`5O1me-L82G~%EJLgo7KNr9t^_#^|@fbQGZe*=Un>@$4&_aFtD z$|gL(fDNVrtKuWWca+n+1~styyatH|figd48O`xMLUflAy)6dUHQ|ajRzrr1{5*uL ztm~1qb*#MQSPn9_YdRU-IwP*7Ib8F05ocXXYZt9Ca|nxtdKPLOGOc&cyJAkyyC3s? VW(>LCeldbI?)1WXXaC){{tKS$N(uk~ literal 0 HcmV?d00001 diff --git a/pyth/__pycache__/vectData.cpython-310.pyc b/pyth/__pycache__/vectData.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4104298addadbf5270d81b2b0903869d4f76c93f GIT binary patch literal 5433 zcmbVQTW{OQ6()zIL|uJ}H?ebNHa81Z>?Vu0+a#N<6Um6Xv1~|6c30S~Kx@V}6G~Ku zv=bZjlk*xB*w;k>`!Nsw2L%fB7xbBLMIYLrK>N`4rA2|BGnA-{<%LrUbK%VJ_MPuL z=Zu_8CMm)3yT5$L{(44|{)8v}M;uOm0mYq^C5cK@wxt@j#Fm$RPSZ(x3P#S* zDf$t#)ATew1MLi*rZdotRCu9OR6 zy0oxde5VEKrsH~n+IBe9TQ1{)-ZowLrNe0uqiZd*$r=P-xpQdHs!$$6aqmFoOP=&h zJ|MnyD4&z|h%a}}WpS=}icdCVU)hlN36*C6g&Ap2^;N)vlsJ$?%BcZqwS$;#wM=^> zP+Jafa(qTk3&{2@D7S;m_N_XOo3^#fD4gEzKTYS z7nu9pbK3LnHruwIG1s$hHcdo*zP;mZ&Li{%(=+GVJHfezX*afPfVY0c4tQ%}P&)@v|_y3yx=I@_35S{Lkg$}AgAoVxsATA#96i@XuFGl4z+MecX2k}E` zFXqP%bV?3^CD2R2NZgMXq;FUEbYG_mAeQiw2Px>QGl26!8dfCXOoN$}m+_OnvQPHX ze%e=`tK9O5pL(wFU-^l>jFbsZ&hR_jOk)5@m3K` zP3~pjJ;o0~Wx(511y`hTW^nYyO!vySq<@xXz`EHI&%jsj-mVlM7avv)&+=@RXU#Q6 zsny!ZgSnp@i{+JNBP4#$GfKzh?yceYyej*&F3?5d(pFSUTU z9^CW}j1eOlkkU$Z_k0D&Yq-u9Z!p7IH(We5VO_&!mltyrz;t(~;j}ky2l0l}Y5@m0 z8jX)(HICH;R+CT#vBnm69UjElE7sWZSfH;nud!*`Ha~^)n#sAj6U5hnI3Q8Miy&o* z7pvoI2U?q3Eicd-w&Su~HqZnhf$FhWAU#AQNJQv>sy8`vHyxV>D!v@QicdMkUV*QG z53NTl-C+9YVBOnl+bqaNGvTu)qx0zUAQO$@EAX@Um_$cUbt=RZR_AeM%yPSKNqp7E zFzDWgN|j0Ui~UEBCy6fWP$p$vPD0UT6(lnSMTLG$%*AB#&ukWF_@{7-29)SP{dd*# z4}8D~^?>gY>X9iyzjL7bp@L+gAO%K}$Y5UqHYy!9J{@3V^qr%LXngsoSS}j9RX{6a zsbtLFD6QPc8#k6$3QLPi#lnr;yA($#_!X>OPB#hL3Snnr|&aqU27=+MOSB|LNm<>IHfkMa|$ilo4 z@iX5;KMyXX$#T(6pDsLpQmh!W*WA1@Sd9ENH+NSo%w9qiC@tq@&qSbjr`fc; z&`LVb!xDZIt2wOZvHBTSXg*k6$CnE)$0*U$49~&oQz$MfXxLJ^5+hkf{jaX2iCB58 z3i!|A@`%Sb2Y4)?`yq#lk-DF1{4NXxDmtcNg6T*woUmcZaqL2Mqi!F-nj`r^SGVpx zm>t0`;uE1;1$3+W3g}ivW~~#!D1wX|V$!`QtA&MX@hEpH#cGcr#%I8Tj%Ed?!hu}! zBosgahT8lbdXRb~TI^+g+q6NF!uF9&1$M)zooYJNTHom^Qvm40joi~vsbtVOs$`si zxeGAWz$n)}Fnx%?Luf~UCxctVlz{+vkXiwFiZ89pJ?H>Z(s6v4WOW@MWPyEQctC8C zyXn*24?4sixLgoR8--Aiaoen6ty_(H8*#jh)y@3xthHqIBFMx08@s3r9 z`)D*|$HWP$Q1vzN4*6D)_GN&xKVoQ|h;g|C-0|`nupxnWz~EJZ;!Z)e36K9awg+~j z(v0*Sc|pJ`vPf|RCXmdNxgUpKWzcg3iF+<28BVi({~g{9Z#^aL8i*Vu9(8;pckZ6; zK*s9cpNkfUM(GPIq1tu27G5%Jip;o-;}=KFMNa%b%pDT_ z%>pnML_ab|JRMQTlBC>e*FAI1W`96nrlA^29dwmIVvVq@AT0fH1RX^L z%VRhhMYwMA3;=Yto?gNSHwZ}`jwoU?DkvyMM4Bf+Qag_E+QV`YgL3h!YOz#VS}7q^ zm?00qWqty07UgW6OCn^CEbr|CjoIib%pxsntXUhd&$CS1SS>FtFOFBp;290E>@}w%d1AA zxVZ4-an*SEq+BkRs`Y3l`iySknm&mbuYz&ItOcdeCqdFju;HlE;0*uWplg4GYeu?y z*u4=bX&@Kw5FlJ##O8ANN=UC@qP-FB+vmEwXpXNCOuNrP(!4d-WFEH~?p)in_)7~6 zhi)*?T3gNbPPk_#jtl=WiGQ>>raoM_79?&EgOCJ&bREByx`OLPN{0Wyi2bA3>WCa# zq@^Og5($F{SHdj|4B+c$RUg++Lz#dwu8-+c`b7Ht{{ZQGt int: + encoding = tiktoken.encoding_for_model(model) + return len(encoding.encode(string)) + +def slice_text_at_2k_tokens(text): + encoding_name = "gpt-3.5-turbo" + max_tokens = 2000 + + encoding = tiktoken.encoding_for_model(encoding_name) + tokens = encoding.encode(text) + + if len(tokens) <= max_tokens: + return [text] + + sliced_tokens = tokens[:max_tokens] + sliced_text = encoding.decode(sliced_tokens) + + return sliced_text + + +def replace_with_spaces(text): + allowed_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐ𩹮ž0123456789 " + cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text) + return cleaned_text + total_links = set() collected_news = set() + def get_article_links(url, already_checked): response = requests.get(url,headers) if response.status_code == 200: @@ -36,6 +68,8 @@ def get_article_links(url, already_checked): already_checked.add(link_value) return link_store + + already_checked = set() for dlink in dlinks: @@ -44,8 +78,17 @@ for dlink in dlinks: total_links.update(temp_links) final_links = {item for item in total_links if item} +i = 0 -for link in final_links: +db_links = set(get_all_links()) +new_links = final_links - db_links +final_links = new_links + + + +if __name__ == '__main__': + + for link in final_links: response = requests.get(link,headers) soup = BeautifulSoup(response.text, 'html.parser') @@ -54,6 +97,16 @@ for link in final_links: texts = soup.find_all(['p']) text_text = ' '.join([text.get_text(strip=True) for text in texts]) + + text_text = text_text + title_text = title_text + + title_text = replace_with_spaces(title_text) + + + print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}") + text_text = slice_text_at_2k_tokens(text_text) + text_text = replace_with_spaces(str(text_text)) try: completion = client.chat.completions.create( @@ -65,23 +118,130 @@ for link in final_links: ) generated_text = completion.choices[0].message.content + generated_text = generated_text + response_data = json.loads(generated_text) title = response_data["title"] text = response_data["content"] - print("*********************************") - print(f"Title: {title}") - print("---------------------------------") - print(f"Content : {text}") - print("*********************************") + #print("*********************************") + #print(f"Title: {title}") + #print("---------------------------------") + #print(f"Content : {text}") + #print("*********************************") vector = embeddings.embed_query(generated_text) - - if not is_similar_data(title, text, link, vector, threshold=0.9): - insert_data(title, text, link, vector) + if not is_similar_data(title, text, link, vector, threshold=0.98): + similar_d = "NO" + insert_data(title, text, link, vector,similar_d) + except Exception as e: print(f"Error in completion: {e}") continue + +def comb_similar(): + + print("Checking similar") + similar_article = get_similar() + + grouped_data = {} + + + for sa in similar_article: + if similar_article: + first_t = get_specific_data(sa[0]) + second_t = get_specific_data(sa[1]) + link_f = first_t[0][2] + link_s = second_t[0][2] + f_text = first_t[0][1] + s_text = second_t[0][1] + f_title = first_t[0][0] + s_title = second_t[0][0] + + if f_title in grouped_data: + grouped_data[f_title].append((f_text, link_f)) + else: + grouped_data[f_title] = [(f_text, link_f)] + + if s_title in grouped_data: + grouped_data[s_title].append((s_text, link_s)) + else: + grouped_data[s_title] = [(s_text, link_s)] + + for title, tuples in grouped_data.items(): + if len(tuples) == 3: + text1, link1 = tuples[0] + text2, link2 = tuples[1] + text3, link3 = tuples[2] + + t1check = num_tokens_from_string(text1) + t2check = num_tokens_from_string(text2) + t3check = num_tokens_from_string(text3) + slice_if_more = t1check,t2check,t3check + if slice_if_more < 2000: + combined_text = f"{text1}{text2}{text3}" + combined_text = slice_text_at_2k_tokens(combined_text) + user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with 'content' field" + link = f"{link1} {link2} {link3}" + + else: + user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." + link = f"{link1} {link2} {link3}" + + else: + ftcheck = num_tokens_from_string(f_text) + stcheck = num_tokens_from_string(s_text) + fscomb = ftcheck + stcheck + if fscomb <2000: + combined_text = f"{f_text}{s_text}" + user_message = rf"Here is text {combined_text}, combined from 2 sources, filter text, and make news content, return as JSON only with 'content' field" + link = f"{link_f} {link_s}" + + else: + user_message = rf"Here are 2 texts {f_text} and {s_text}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." + link = f"{link_f} {link_s}" + + try: + completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "Data analytic, Journalist and News reporter"}, + {"role": "user", "content": user_message} + ] + ) + generated_text = completion.choices[0].message.content + generated_text = generated_text + + if similar_article: + if f_title == s_title: + print(f_title) + modify_similar_data(first_t,"SOURCE") + similar_article.remove(sa) + print("Modified") + else: + print(f"Second: {s_title}") + modify_similar_data(first_t,"SOURCE") + modify_similar_data(second_t,"SOURCE") + similar_article.remove(sa) + print("Modified") + else: + print("Similar list is empty") + + response_data = json.loads(generated_text) + title = f_title + text = response_data["content"] + + vector = embeddings.embed_query(generated_text) + + if not is_similar_data(title, text, link, vector, threshold=0.98): + similar_d = "NO" + insert_data(title, text, link, vector, similar_d) + + except Exception as e: + print(f"Error in completion: {e}") + continue + +comb_similar() \ No newline at end of file diff --git a/pyth/templates/index.html b/pyth/templates/index.html new file mode 100644 index 0000000..9b156d8 --- /dev/null +++ b/pyth/templates/index.html @@ -0,0 +1,23 @@ + + + + + + Test Pyth + + +
+
+

Test Title 1

+

Test Text 1

+ First +
+
+

Test Title 2

+

Test Text 2

+ Second +
+
+ + + \ No newline at end of file diff --git a/pyth/templates/one.html b/pyth/templates/one.html new file mode 100644 index 0000000..bcba718 --- /dev/null +++ b/pyth/templates/one.html @@ -0,0 +1,12 @@ + + + + + + Article + + +

Test Title

+

Test Text

+ + \ No newline at end of file diff --git a/pyth/templates/two.html b/pyth/templates/two.html new file mode 100644 index 0000000..bcba718 --- /dev/null +++ b/pyth/templates/two.html @@ -0,0 +1,12 @@ + + + + + + Article + + +

Test Title

+

Test Text

+ + \ No newline at end of file diff --git a/pyth/tests/__pycache__/test_scrapingsingle.cpython-310.pyc b/pyth/tests/__pycache__/test_scrapingsingle.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab3b6cea01b3884563720a5ef05001b86732ad46 GIT binary patch literal 2416 zcmZ`4%Z?jGaC&Ty$K%I7vv~l?aDZS)>`e$DMNvdZR?&jAiP#8aMPL7wvZ7!1vpG-}9Tk zq{*PVnZT z6Rq(&z?1wGz|$6(L0Bg=qw}KPPhFRSmJDTiK&%TCqqEm zUvN6s>F!u;7vmBLi+4(%-`L#9b~s~cK2SjO@814~C%TY;dow;@O;J!5vqJNHuUj!o z1FmB#b(#nsi!^_rOf$_DmpW!tQ?rn&SfyDisGJcxckXU%-Pnw8+}(`7+W2;|Hq^Sj z+V5Yw{8?`u{!3RsSzll8C#dtQp9_o6VW?FA+=&QtB9{^7K6W4$6|l2v7VdGSH*?Jg zlIpa`69RjWzYZW6`kOFR037XTqTNIHfJ~jEimn1q7u9(6_w ztib{d+0QO(v(B{klfxFKb%2Y%)Q~!G+0qmDz@Pe~MYep}87+;L4;mU$VhSm75KIHM za@sj)!n_J|OP@Gevz#_qbdsHzHqjc3uT5KX94)OKTl#Q;IpDgJTYsT{C(WVGM6aY0 z0tmr%hnHbog7NWl629h5++KsR4dd@q00wZ+eII;DMz%zrkRRNK?zZ#Hy$4xSQ(C6w zGe|1T7tO!ZUDvq1byL%+7W|nbPlJr-oBd&t@jg{_hvsSj7FQ2+QTEjy-%Cdj+w@{a zmFBWv?(1P+14UUFt3=Wg^A?8Sy>j0K*F>sxQe@@z_fSVv_>?ob@wp{!Fr@rn=*;0$hf3&016ybWN|9+djx$GuN3 z>apA@jHmX&v~2RxEvTE2=2YxMRb2?bEXEQRpbHRLxWyl-Q1Y^n;0JJGRThGqdQyN1 zxi-}?gl=+u11_hDhK+*n(OidVzFWx5RzKVU!h!moK&>~Xk!Kg^5#VvY)c*bJ^i`BZZj>cR?4#!?~~jkns`=w7gBDT}?7 z@!3AXX%5G*MTAs^H2iY=&Q@6LjG*C!k5WAh&)cKt?a}jd>&}O}DHlxj!0uXBFfNRr zai!>hD~Yu(F%#vx=-mJVJu_(1nBkyQ_mqUD?--BGdes6Il2f)HCqte*;LJ3s!sfN{ za7=}%2%ej!E;JSM-sVgGaExhZDv-#sg4dlzf&OlqvzQ8Ds-q&!jhBej#v>V0ZJL;e zf@4ooz=A6^#%x%m30I~*x4|k7=T(uL8n#(w0{(ADHcgAiaKGF){!Gu!yxTEV#l^0v zqD2N2R3+TPfeNkNc-D;|$j-L%pyt36$~17AQG%EHna+Uc9Qr3`GhX{xqV3-^4_lMH zJ^MPmk>qCa&5FWgSL*-*vPgV4AT82yS4j(o@47J7i2K5O(WwWnPka)%Ew@FMN%hYO zB;jlQ{6EFI1x??N}M{kG}WE6ukD7zt9}o_c-1cpq0VLkDTF~_ddSG#n@P2!S9dTzwtk-mh~4d zj{h83Jb)orK`@Jv)asTq=~A<{ySAB~t^+eo-OTHHS*2Ua{H~u>yVWe{2E_WoVm5PL zSjd>DB=ItZHzb1+4bU>W;BG8-uyQ8f+ZqaW=sw zVV+=9>=MkAY)#n(ink|NiY`R~F<3roueJXu&Q!PBMoqOQTCCl8mF zo-HiSo5!7pkJ~W;Hx7P?D=_9@$PEz68WHqsXob{Z@C>kRF!%=8F&Ml9>>3RI0rpg7 z!yj536ow7Mf#JgNnD;x(VO0gNs_qcDMGJhXbc7FI6cACCzyK396m<}Lu6EW7skGB8 z1js&Inf7~;lrIXww8vKRDC4b~riIiVTq4gmMFUTdqnJQ3iDC)_9F$ve(OdgeSK=bi zc?=p62oIj_5pBn*+_S`0Sp7XWyIy2`Hj>dwlqa+ETs~JtZ&q&d&13_ZPwr%qR9wvV zw$%Epf*omOxCv_KA;4yOTRMONE(;X{op4U%beTdGErOM4b&f zxN+Ky6K|fekq=&3oF5M3k>_ zfxSVvAao7V0V;3%mwuGCJb_oE-BP8aP@98_bd_f-oUtTd(+*bb(=^>MpWmMTlaSvbf1eg6(oo2IHvU##e0xyUC zGYG9)94ov37NgiR@ot-+~VJLGDZEL>riz61-4nB AbN~PV literal 0 HcmV?d00001 diff --git a/pyth/tests/test_scrapingsingle.py b/pyth/tests/test_scrapingsingle.py new file mode 100644 index 0000000..5afcfda --- /dev/null +++ b/pyth/tests/test_scrapingsingle.py @@ -0,0 +1,60 @@ +import unittest +from unittest.mock import patch +import requests +from bs4 import BeautifulSoup +from langchain.embeddings import OpenAIEmbeddings +from langchain.vectorstores.pgvector import PGVector +from openai import OpenAI +import json +from dotenv import load_dotenv +from scrapingsingle import get_article_links, insert_data, is_similar_data +import os + +load_dotenv() + +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +client = OpenAI() +embeddings = OpenAIEmbeddings() + + +already_checked = set() +total_links = set() +collected_news = set() +dlinks = 'http://127.0.0.1:5000/' + +class TestIntegration(unittest.TestCase): + + + def test_integration(self): + link = get_article_links(dlinks,already_checked) + self.assertEqual(len(already_checked), 2) + + for link in total_links: + response = requests.get(link) + soup = BeautifulSoup(response.text, 'html.parser') + + titles = soup.find_all(['h2', 'h1', 'h3']) + title_text = ' '.join([title.get_text(strip=True) for title in titles]) + + texts = soup.find_all(['p']) + text_text = ' '.join([text.get_text(strip=True) for text in texts]) + + completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "Data analytic, Journalist and News reporter"}, + {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."} + ] + ) + generated_text = completion.choices[0].message.content + + response_data = json.loads(generated_text) + title = response_data["title"] + text = response_data["content"] + + vector = embeddings.embed_query(generated_text) + + self.assertIn("Test Title", title) + self.assertIn("Test Text", text) + self.assertEqual(len(total_links), 2) + diff --git a/pyth/tests/test_vectData.py b/pyth/tests/test_vectData.py new file mode 100644 index 0000000..99d4dd6 --- /dev/null +++ b/pyth/tests/test_vectData.py @@ -0,0 +1,89 @@ +import unittest +import numpy as np +import psycopg2 +import os +from vectData import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db + +class TestIntegration(unittest.TestCase): + host = os.getenv("DB_HOST") + port = os.getenv("DB_PORT") + user = os.getenv("DB_USER") + password = os.getenv("DB_PASSWORD") + dbname = os.getenv("DB_NAME") + + @classmethod + def setUpClass(cls): + cls.host = os.getenv("DB_HOST") + cls.port = os.getenv("DB_PORT") + cls.user = os.getenv("DB_USER") + cls.password = os.getenv("DB_PASSWORD") + cls.dbname = os.getenv("DB_NAME") + + cls.conn = psycopg2.connect( + host=cls.host, + port=cls.port, + user=cls.user, + password=cls.password, + dbname=cls.dbname + ) + create_db(cls.conn) + + @classmethod + def tearDownClass(cls): + cls.conn.close() + + def setUp(self): + if self.conn.closed: + self.conn = psycopg2.connect( + host=self.host, + port=self.port, + user=self.user, + password=self.password, + dbname=self.dbname + ) + self.cursor = self.conn.cursor() + + def tearDown(self): + if not self.cursor.closed: + self.cursor.close() + + if not self.conn.closed: + self.conn.close() + + def test_insert_and_retrieve_data(self): + title = 'test_title' + text = 'test_text' + link = 'test_link' + embedding = np.arange(1, 1537) + + insert_data(title, text, link, embedding) + + data = get_data() + + self.assertEqual(data, [(title, text, link)]) + + def test_is_similar_data_integration(self): + title = 'test_title' + text = 'test_text' + link = 'test_link' + embedding = np.arange(1, 1537) + + insert_data(title, text, link, embedding) + + result = is_similar_data(title, text, link, embedding) + self.assertTrue(result) + + result = is_similar_data(title, text, link, embedding) + self.assertTrue(result) + + result = is_similar_data(title, text, link, embedding) + self.assertTrue(result) + + def test_create_db_integration(self): + cursor = self.conn.cursor() + cursor.execute("SELECT * FROM information_schema.tables WHERE table_name = 'vectorsvevijesti'") + table_exist = bool(cursor.fetchone()) + self.assertTrue(table_exist) + +if __name__ == '__main__': + unittest.main() diff --git a/pyth/vectData.py b/pyth/vectData.py index dd1e2d7..e99883a 100644 --- a/pyth/vectData.py +++ b/pyth/vectData.py @@ -3,12 +3,26 @@ from psycopg2 import sql from pgvector.psycopg2 import register_vector from sklearn.metrics.pairwise import cosine_similarity import numpy as np +import os +from dotenv import load_dotenv +from datetime import datetime ,timedelta -host = 'localhost' -port = '5432' -user = 'postgres' -password = 'salmonela pljusti 221 hamo' -dbname = 'vector_svw' + +load_dotenv() + +host = os.getenv("DB_HOST") +port = os.getenv("DB_PORT") +user = os.getenv("DB_USER") +password = os.getenv("DB_PASSWORD") +dbname = os.getenv("DB_NAME") + +conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) def calculate_cosine_similarity(v1, v2): v1_normalized = v1 / np.linalg.norm(v1) @@ -17,7 +31,7 @@ def calculate_cosine_similarity(v1, v2): similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0] return similarity -def is_similar_data(title, text, link, embedding, threshold=0.9): +def is_similar_data(title, text, link, embedding, threshold=0.98): conn = psycopg2.connect( host=host, port=port, @@ -27,25 +41,33 @@ def is_similar_data(title, text, link, embedding, threshold=0.9): ) cursor = conn.cursor() - cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;') + cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;') existing_embeddings = cursor.fetchall() for existing_embedding_tuple in existing_embeddings: existing_title = existing_embedding_tuple[0] existing_embedding = np.array(existing_embedding_tuple[1]).flatten() + existing_link = existing_embedding_tuple[2] similarity = calculate_cosine_similarity(existing_embedding, embedding) if similarity > threshold: - print(f"Similar data found: \n #{title} \n #{existing_title}") - cursor.close() - conn.close() - return True + if link != existing_link: + similar_d = existing_title + insert_data(title,text,link,embedding,similar_d) + print(f"Similar data found: \n #{title} \n #{existing_title}") + print(f"Inserting: #{title} \n") + similar_d = "NO" + cursor.close() + return True + else: + print(f"Same source of same article!") + cursor.close() + return True print(f"Inserting: #{title}") cursor.close() - conn.close() return False -def insert_data(title, text, link, embedding): +def get_similar(): conn = psycopg2.connect( host=host, port=port, @@ -53,17 +75,35 @@ def insert_data(title, text, link, embedding): password=password, dbname=dbname ) + cursor = conn.cursor() + query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')''' + cursor.execute(query) + similar_data = cursor.fetchall() + cursor.close() + return similar_data + + +def insert_data(title, text, link, embedding, similar_d): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + c_time = datetime.now() + + cursor = conn.cursor() cursor.execute(''' - INSERT INTO vectorsvevijesti (title, text, link, embedding) - VALUES (%s, %s, %s, %s); - ''', (title, text, link, embedding)) + INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time) + VALUES (%s, %s, %s, %s, %s ,%s); + ''', (title, text, link, embedding , similar_d, c_time)) conn.commit() cursor.close() - conn.close() def get_data(): conn = psycopg2.connect( @@ -79,11 +119,110 @@ def get_data(): cursor.execute(query) data = cursor.fetchall() cursor.close() - conn.close() - return data -def create_db(): +def modify_similar_data(new_value ,title): + + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + + query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s ''' + + cursor.execute(query, (new_value, title)) + + conn.commit() + +def get_specific_data(title): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + query = '''SELECT title, text, link, similar_d, embedding FROM vectorsvevijesti WHERE title = %s''' + cursor.execute(query, (title,)) + + specific_post = cursor.fetchall() + cursor.close() + return specific_post + +def get_all_links(): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + query = '''SELECT link FROM vectorsvevijesti''' + cursor.execute(query) + + db_links = {link[0] for link in cursor.fetchall()} + cursor.close() + return db_links + +def delete_specific(title): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + + cursor = conn.cursor() + query = '''DELETE FROM vectorsvevijesti WHERE title = %s''' + + cursor.execute(query,(title,)) + cursor.close() + +def cleansing(): + + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + + day_long = datetime.now() - timedelta(days=1) + + cursor = conn.cursor() + + query = '''DELETE FROM vectorsvevijesti WHERE time < %s''' + cursor.execute(query,(day_long,)) + + conn.commit() + cursor.close() + +def drop_table(): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + + cursor = conn.cursor() + + query = '''DROP TABLE IF EXISTS vectorsvevijesti;''' + cursor.execute(query) + + conn.commit() + cursor.close() + +def create_db(conn): conn = psycopg2.connect( host=host, port=port, @@ -97,19 +236,18 @@ def create_db(): register_vector(conn) - cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;") - cursor.execute(''' - CREATE TABLE vectorsvevijesti ( + CREATE TABLE IF NOT EXISTS vectorsvevijesti ( id bigserial PRIMARY KEY, title VARCHAR, text VARCHAR, link VARCHAR, - embedding vector(1536) + embedding vector(1536), + similar_d VARCHAR, + time TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); ''') conn.commit() cursor.close() - conn.close() -create_db() \ No newline at end of file +create_db(conn) diff --git a/pyth/web-server.py b/pyth/web-server.py new file mode 100644 index 0000000..ae78c2b --- /dev/null +++ b/pyth/web-server.py @@ -0,0 +1,24 @@ +from flask import Flask , render_template , jsonify +from vectData import get_data +from flask_cors import CORS + + +app = Flask(__name__) + +CORS(app) + +@app.route('/') +def index() : + return render_template("index.html") + + +@app.route('/article/one') +def articleone(): + return render_template("one.html") + + +@app.route('/article/two') +def articletwo(): + return render_template("two.html") + +app.run(debug=True) \ No newline at end of file From d4e99c7c5f2227f9e4e6408f044f561e9ed51701 Mon Sep 17 00:00:00 2001 From: Amir Sabani Date: Sat, 6 Jan 2024 08:17:05 +0100 Subject: [PATCH 2/5] added article.py --- pyth/__pycache__/articles.cpython-310.pyc | Bin 0 -> 5925 bytes .../scrapingsingle.cpython-310.pyc | Bin 3031 -> 6614 bytes pyth/__pycache__/vectData.cpython-310.pyc | Bin 5433 -> 6045 bytes pyth/articles.py | 231 ++++++++++++++++++ pyth/scrapingsingle.py | 46 +++- pyth/templates/index.html | 1 - pyth/vectData.py | 62 ++++- pyth/web-server.py | 7 +- 8 files changed, 329 insertions(+), 18 deletions(-) create mode 100644 pyth/__pycache__/articles.cpython-310.pyc create mode 100644 pyth/articles.py diff --git a/pyth/__pycache__/articles.cpython-310.pyc b/pyth/__pycache__/articles.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40e56ebde6bae660b85d03f5e389ea869ab4e067 GIT binary patch literal 5925 zcmb7I&5s<%b?@r0>6x9G{r=+eqh(v7Mp-S*W$f6LESnKm5*=$vaFx&=h?q2|YG-@5 zr+Zl4v#TBS2t-!7Brz<*mjJSWrGXrjzaXaoxdsT3bDaX@U=RUvF_44e$oakM*$;lK zk<6g0U%h_ys_NByRliqBtyVGcE4}fz;$OaD82?U{(?185AK*zpWQM^F&LX2(yjjyk z-HfcR-Lx6;tjOuQO;^(_Dm6>s*^$>RH_P2hv!eMCw9W%`t=qN?A-MQvmcfL74;x9B8gnM8xLzE5|x%Zeg&x!JZ(OeQ$ zl;<_K4DJO@RX|ZXg-=-X zWxmMIf%A%ZMfW?6elCm4^6$mvo5uZ1%~wT(FNs&Vv1oi|d~O~vjIF`XgERXS=K?qv zPdE?%hO-S=b;D@*Psr};4W=gBNgBqYm4@9g3S^k=gEI>#r7^gyJ zMP;fdMGqavomL>TupNn1jcY|oU$#XdR40ZM0pHhkjA<_JcUxJqBjU8RDU)t1%}9ru zPoog6GVxK?3bNMf&PYhrZ{Jy4zj3RDIkf&{?I)^q^N(Ba-QM^>6L)UkEr>teSi7sn z@afKtjg5P^@7~m(*KfSP);7sxqaS&yjpuDV>2(yjkr~_U(9Dg;Y{?j!IUCiPu3MRv zn;n*09d=-Hwun_&G=_F=V=c^e33oRdwld?Ma-%Q~qK>lTM0O?h*>IG(dlk>Bs_kBF z5kH8+f#9gFo~~9-*6<5MUO>|?zjI|P>53~s8axQ%@XAe*?qo^tO1dj{!)=jf;p^QX z&4j$t+t0SH9NB-Rx38AkLDcR?K_*(ydB`GEryhz4Ho@jho5@S4v}qPcKN|S4f!;$U zBNJira)C2$eu5gexc!&rqH$<)=P|o!9IzqFtV278$uv{jWiqcF|d zNw@db*HHfrO=HFwyaoqNMN1qx`^ec=-i#8MDSZpXQL`sxg(ttm?@P8kxN$4q#bR+k z+wTc~Gm-vNI#&Gm`)TGs5WWbrEg^k4u@9lRw-WO}%3yz`QBsu<5t?Q}+!o4$gmThe z6lQXkM(k)YRT{~w%AW;MUldVls@hZCL8^}_$n@E1&%JnlJ~~LNC@SnctFS7wnPKXry!l*7_vW5{xL=yCHgwk%L(a1bhomB-j# z$sgbsruGDjJEv+kcTd%v+{*3T*|ZQmrS&fyWp5>ErtDrKQTDO5D6bc!>0Tnaa`}Te z=!(I+8*4vWd;0@F3$sWp`-rlg<&(wm-?@AHeScT9vqYW*hkx(AwYzJ+-go@(`5(x> zc;igx`^{^-vx^6U8UM|dQTMq_Y=c04`Uv_*qW zPrNUEk>8y;E=f8uhL=e-Z3^-`$CR#;% z+ZEFua%L~zg^H8}7mUUj7AoCuC%w+9Dz%e1#sbJye3RFx_(LkbMa8#KD7W30X(Clg zd?ecaOek+tWbLgWij>oil2qUbvPp~l9*Nuf0CEc(Dn}bpRgMGVxgc2jJr;ir3R5za z&CKJ!``=!v!aV%zW}Vep#gy+rHXUu zn!^g@#)j2lEg#FPn`Z9qGTG0)!)j620jR%p*s%VbhF76s92zEulf$XJl8^7eWAll8 za-=EAYI!Z6IxvUR`E+jX*!W7iwS?APKdhhB9=@BGQLodNhp*)2dxi-NXXAfhG2_|V zhH=$M|7s7i^#2K-drwJo_#z}edsbot`e4QJ{eNV}aE2L!@oa*d`OFbU>OhYReH>pu zo&gfl?>)9>G^}-b70dBuNpGCj{l=tx19kZ(6=yl5b~C-?*p@}6$^zS~kT^1xH3)mk z3OLs%qjG~D5ENIARFz=Eh*BiB;xW0^-Z2E$xGx- zD%MaaAf}`((p2yh#I(?nNxuiq2m_paVWBEw;A#;AS5L(NSh}5J+asaXLTL4r(5gI7 zy)yC4hNUiThB1bKM=N=*YZ`m{0~96>j?-Z?;Z>On*l^LTQrO@CUD_V|FA0qIWu8sKUdsrXwi5}1E zI_9#x{+Yp@d>lTExR1@eK6ElCuR`Yp-Zi`@DbD7N;S_gsPfM2}Z4ak27qRIbn9RsZ zyfmVSS3c$yl!Zv9xlC~nk^B#N6N*U)-@k zy>wzzSauP!bYR)}6Rx)O__6%AM_)+t%r(GTEMVN{inYkkJ-Z(0X%6k(|HrD(O3fXw zR6#wpQkcunPpv{ZU(j=T@rqUX#3|WHlEu1}hO>k3X65b5_Skln@=!j*XMbt(xqO!1 z9h1-h)EqA5OWQTF`*wYMe0V;aIGoJSKW4)VxrZF)0$+foFJ@DR)0%rJn?VF!#EEn% zpFVKleHLHLZH!H4s`reC9`@JbxSy=BGLhO_%-6FhZuJ1($v7dVV+95q>Q8QnZPYDrd`p zidkRAevtY<+_=5&Cvmjz?*ZETK=7SN_?JhLm!VBWd@!lI4={vPtvK}ue|J0tA1MKH zhA2w*(0cS82S?%u=y6M=;qIt^DY^;hAl~=mBz|3wNeW8dka;Bjg9Ia>38s47I6ihm zF0^{2+W}3Muo4YB_`eN&<%y;3dhOH)={JM9S;p7z{ch zl^qOcFmp@aE0D2{R`I$oZ{gFIr4I=AU-t*zjX2#C64bf%1k0jtJ^Z807v}EAe5Em` zj~r!1B9`lDFYi!6Nsg>hLCLQq04M7xfJ~zhqXYWf3NkfD`C}xqFo{!qLizwiP-<@_ zxR^*)X*6a%WMyxsNsKcW_bq90x{qmu+IrX*a$lB7dwK1nw&;<%#!KqbSqD9?N78Tm;rWg=(&`M%_4_et?qDq8?sjdm)IOIRT3-nr* zKc=bQr=FZmT*9PCN%hMuZe!`O2;%F)Kos6nde%2b<03&D-;1Q~)*$S59oIa2XV9n8gc~HO#yRTaWIFY#O3+i)2o=Xrvd83!UX30Vp`i5omb;iS(aQ zxIiWqV3WG(A>jr705UNj5y+={6(|L7;1*5eyKNH3V-sjqK?*=P1_=Q;bv6ZF(W+>7 z(vn(3n$78UTFa=1Intu_6uK&GM(bKIO}z0yqxqHlRo$gg(,DLj;b)M7 zu>YsJD}O<=-bOKSe}v^G=ikVJte+w%+F&;<`7Vjj7AfhDrtz9ay~+WO?d=!$3k&HE zwg?%obOkQh+UZI!2<2XgMWh@K>9xKjlzKZb6Zm@qqze&XGDA|{sAi%d?jSJ2c;!^` zMVAK3qeY_qpa?A;RpjzTXc54Zbp4>bEX2JP?)p?cZOfpCg)27HA{r?Mr6fTYZiG8a zIZm923PQ|hBY3*ED$-TT+;q@uvnfX>{fY%e7v-AFD(DL;eJ>D%Mo5I@Yk{RVvfo%kI#nOBFVS{|vK#YqS3ak$(M) literal 0 HcmV?d00001 diff --git a/pyth/__pycache__/scrapingsingle.cpython-310.pyc b/pyth/__pycache__/scrapingsingle.cpython-310.pyc index 38af9dbd593bcd53feec35a1f82ca687699f9f61..34597ac35e738ff6dec6066b9f60a885ea55899e 100644 GIT binary patch literal 6614 zcmcIoO>7%UcJ6LAn@x(O{{Q^j{u$d`%akoyqp`>98UM)tmM0_6ddwY%X|ak@OHDRa z-I6V$nK+P>%>wbpyFijnW+##N<{W}K<(4@F_F@y{kmT6c90H#P$R-G21e1KPnw0-$ zfm}kOU%h%&_4?JTSMR62TrQ*F_g@!&%71Y{QT~-Wul~Bw`4Er)51OJdg{iJmQ%kB! zxaMjl4Q;|r)bx@rX}X)N86~5ZDy7a3gfJV=&yvtHKsKq*WwzRPwq3-6nv_klJ|GuWO5!rznQd*%3h?eP=* zMCk;d#=Jdz8Zu1tlWZT~%}=oGhW5Ggg|>lRaEqV7*a^(pNf_bH%Y1a{{iRb&-(^|0 zpS{Bl{7hp9*&#mtxxxzk1mJhRND_5HV}~DVrPKWM(g#c#RhB+nI#W8!&*JIdJjc&M zvh(~r8yi(hGiZ;Xo#5LxFUXOLJjZAF1onQB71{X1dRcYq1mt*`&XBW4W+px~kmG56!`7&@;}AsXCQ-JVzwkb<3_(U-qMP z#pQP02kWT6<}s(b-dQZCLxe8ZvssyW0k5w``5Pb2%$~VihKS{>Gq($gX7^$cH2jm} zZ|wOx6ag=@Em4M6^|8HE3KXUQJ#CxAwXICAUjbFQd`}p*y1?x$*4ZxZIkvtG003i(ljx8o`mt;?$8~Rm^*l;Rk|LWC#vIy#n{N^-BQL@tnisj{pS9 zrW)d6w6s7AwarARJcen*4Aj|z9+`o&9C*vT9`*5h1sbm}l&hX7L;2i|5-tV|Ilv1E z(F=A_G9EHJ)1N3}6!aIT#}{ccwr|hdb!YrM_rbU^?yvDR2O18XBQ@I(xEOD&2aDr= zMc56pAUrNFHrAt|`f9B#Me)m3=%pNQavSY~Hk48Gs@M*sq5^b&DIm0Ex!p@ZnA%dA zhRt@iod^?~8g06z2Z>PMP(y856~m#j040PvQ$p?cv{9v%#5_GrVxAEiP)}l3^nl;< zf5IyYNzo5n3=kM1FbEJCj6-D{mp~8ry&%eUbW*O{H6CSZ_PzFw1f4`=#M_HmS-E?m5H`O4L6*Jp2h z^cx?~-TdTxw?4gHvga#|R~HtYrDeBP_Zln04_4Ri-dk@z{lljZpZ@;SADn;o6?^v8 z(??H#{A}~t<7a>S?5m^4CMJ(hy*vHh2`hSAKoV{ej}-8Cl~B3UMZ)}nvZjbksMb}c zd=JrYMPnqO7OB>()Duvxcr?;0i?(d0mx2!>-_^mJ&=| z*2IT_4m`=U4ccpB)4;BAjTh*$>d?wQx~fYC(^--k54Bb*Oi5jtK`K;2lch!#tW1aL zP+e2SsW7=|hH26ebOox^)jKQO!sHxMI%v$Cef~?x@}g@osJX=k?1zgeWsAVUVMnTc zlZ;?d@Txe4X@yK=3cj)m-{waN#IY!~$Zf`jj~gA>!Kz;-?;z;*#WrJBojRjvC&<_% z!)_qNGBFV|OCa_}-L@;hWF6*<7{M96!f$v8v61d0NJnN{96!p?(ZEZ3f{T;HFDZ{q zOc5RruG#J?fAhFx6mD~gkL^5G$ZMBX;Yjf4IKI4U!sD9gWmN6H_TPF^{||8$GhZ`W z+8k|09wfQPMY_2L-CUuC>QA&4WE?LgC}`p!CPlgv@Y+jqd~4FDV1bCujnj89$0ybU zinvbm&2qVh*j+A*6X?Q@wZ#cM$M8%Fic=A7g@;Bk&18 z`yJ9U-q3_()u4&wgq!)B6rg=;)wdV;NekhB9f7p=;_!K-T$Ww8-E~;#n03WNGGLQz zATn!~-}NorbWb4aKG7mwc(CGB#j7KC3LLn3Go19J6jhVZi>lXno9U-Wtct*R8r&KT;JoMSuAK>KE0j{;b z=(#M?W^cS0lnF2PYLPj*oRo3Ws$kL9Z$B_95T)V&Rf0SiLtdpSfeNp!IBn1UNWJq5 zcr*oN&c9Ip#gJ-jsI7#=6B5_M1W}nTlMsCiH`;W}f@h2Agqv+TB`xVTo$!qGbBeeT zs$qImL8g}qlT{fJ>hO$&FW?4*=~GIGfQnec(96h~W@v29l(?Lk!Ax~ie@qeRm!D~^ zu2wckhS?33CE=a_J=&smUSGF)eI3TU(3i8nl^3K>L8fhJO|;v? zZQ=G|rL`m6L3_X%?+#Pgfl0;KmgG4pc``JX6PU3hOiKQS+$BjVWo3C<3Hh)V@+Bc( zXT{XFuW0^GU@oPXGD}@l*i%uD(zHJup%%sH0OH}!aOaXdqov$ZzO_qQ;g4V?8CiC_ zffA;r#dgD@1~)XsMP;)mlokVP9-^`O0bs*nnsy}TDr_XoZ1uKQ2Hmj4U&liSk5Re0 zce5|tg&4j&Ot-qi-E14|)(ZMJ2PAb*Fc@})RyZ2&VcXdbSqA?N+u7P140Y(?-gX(> zr7OWO%HV%imDWBggYk}C;l7S!vJA$^Z|_^_VzQ+&m~h1T-LuV}a};06p|#6ZK~=?b z3CN#6$BkCR4uEFv5*OTZ{Mc7D7skjb%sb?jq<@^W5SRtB#xXb@7rs_bj9ppHMztsX zVB0Z`!4Xo;M!qceWOaAF?Z;mCT{5gSd*#b*2rwzx`c^Y3JzeuJw&ZxjLz7GO;k)2z zZHT(aeFsii3L(D5LBHad=RLU7csT`-|BD~~bw~cB!Ya9Z+(`Dx+Gpkaw^t^Ny4Ct zRigCFy$WwoG%M_nx;r0gItgxeK_J6K>MuX#Xuch#s-%~45ShNz3ot`woz->=ajP<8 zLQ)d9DfB5fh*BWP?1=&{O5at)Q9nwOc^!*WRW#I_h=;< z#PxI8sp2z*;8Ct~A7cr!l-QIzN|V`esv5LeAaA3K53zk(6=hzkKFU*pC@4#b8Ra^u zw^T@+!W(PIJ3JAkJIZ5``c{INCDxq}SAU7^`(FZ>sL*vSlR%yR#^2Ni)VvnM9<4J< z6%?i4rrLuii>h%*HMEgL+;&E3b_O#HtuGO`8fN3cYV8~2n|#W^SAjT<$0rn|^_0-l z-_$Z15`BC<7(m;j>i2svnpMsF+Be!4`u~WZLFj@iFH_-Kl$iIA$F=9YI-dqnP?q1aE*11S zAnFRd!0vp7%vX@^LNpixxlLL12r81IBqO#_26wvAu8@t@2JQ=wx=`G0aWL6@dpHda zZ4^ZeeHTIzLq;j6#1}IJ=!1f|K;R4j?h4%-UyM;FBRjD{$_s*zq;yXtMNs`AWTmh| zm>}>c1bhMk0ZMhnpAxuB;HLy+4!lm-pAq-G90331{$35lIA;v^0s38ZxmsA{09T8W}opl(Gi$MfW@S?@Zt z8$PT`MhcgzLIUlPA{lO+`xo@yTw8VihN_nyka|FUvknFI!mQ@^=DqpNn|UAF-?o0w zW!!YyAUJ*~{RlTwPcqZ=izln)9wnS`X2;Dq%9yRU6J{c`X|~Z$n#ph-w^Qx3nQmvy z%n5(U974Ws54T6m5lUd>h##ixH_{4}3|m8XQy!Wv@XSxi}* zj29mUdIIGF15BL1wz{tg9q|SNp$hcfgV++cdZ8^6+8C60W z3Tt=j_946^q8NN0+Z`Qgf?ul(-)h)U3(1o~GFhVY!TIEcS`mE+1tlig`i6{JKD0x_ z0Or&kiK+*F!)f$m+>*2q94D`DEhu8l zw|pD^@7bu@mN#%+LDt{O^#l={!Sz((+8QdNtl%s{i3#OMWEK4PY@r?~HW4nW#TG(0 z-R$_~t@ZO||4{6?QuFS67*sd-CACIN!C$FYJq$M1b~)HmZ-eL6o8So*n%Gh2i!1LN z-T2okno6NtEWdAbjZa+12gff|3eDqR)!6>?a*4`>zc0YscWvGsyCQ@u3YJq)S+>Eq zT&GefU6heMZ(FFOh~c1;n-BJKQ;RWF<*Z$InvH!-{ra7-Z9Hrp@YXxcff{_D%U_kz zmghQB<1XHRMIWvZhHc@?qygWvM8lhHGU5u@O=wbXQ29_vB;>+aQ z;PDv^0d*gw@zyBgjeXbZHNcbcld^>clbI$sAdt;ZF0kD?_hcFq(s3ORYP{~(!vf1# zM_3qeG7PwOa0ueQ)CMV&i7WRS&{4UU^xaMkn@#nZ)T?c`#}93|L{as+2q~h|6n+C7 z4yDEnrZJ7`Os85zNo`nX28~jKvIiO5)tNezVOg5O$&km8HBQ-U?R6%u_I1=V=uJ%{ VlEroO!7ySfLG=ghb@Y!Y`xiO0xPAZt diff --git a/pyth/__pycache__/vectData.cpython-310.pyc b/pyth/__pycache__/vectData.cpython-310.pyc index 4104298addadbf5270d81b2b0903869d4f76c93f..ad5b7d5761068ede0ccc9fe7203405c2138f4c53 100644 GIT binary patch delta 834 zcmYjP%}*0S6z|(^yR%EX?b2GL)LOn-sFD~CAZX;HLJ+CY8VI1IVON3yOQt0eg%Cdy zPiXKZ#tTMna>Mjwj0gP-h(P|=FZCBgD2LKKWQV#P}p$k@A zkj4-00X3DpI^3P8H#gB|G-IE#qKV|)v~t6^t+dIO^M-;RR<6o}+)KxI9^+}2R^l;k zXRJlM&l)_jcx}#X*J3o%_?kU}5o}W40HHIsZb>b z3}oQl!Yl1vplTspbq8uJ=Y|B1C3vZBIF_AcwG9XDokabR!|z^vTM~91FnFnkaKA(e z7k9C9vD#d_2|IF{FWCGn5$2t^s|SMmw4I+U#!On7d}f(Ed-z=qX`Uk@y@E&;?;+1o zA)XK$*l}IV%;x?}7YvJRx$FX;vWw|fMWH?c{b*<;r=ut*p=G5_4gJO$lKxfeseC~p zdm#c@lHh2%|1d?C{1UY)DgO5HOB7)`4;$W!x3$Qj*mbQ;;RB-1WuYosI?wI-44=6* zJ(=P8nWSCbSNuOE8F0af!2~`t)*)GFDXoJ@ym&G9#mU4rIn4hM>N@xg;EIJ(H!-|3F! zAl`X;iel$_(`w`z$pb*y#=7D@8gAip5F2zES`n)?Z?Uqt;L)@2nXa+@HKSFhwLqHw zho=iWp`Hq5FL)H`DXifmL+ZA|If1Hg^78HhRm2Unq0ydy*=WmLx4p fs!FO!s*{^8!lQb6HW3pk5fe5eCZZxKY{R(&ft+L_ diff --git a/pyth/articles.py b/pyth/articles.py new file mode 100644 index 0000000..346a917 --- /dev/null +++ b/pyth/articles.py @@ -0,0 +1,231 @@ +import psycopg2 +import numpy as np +from sklearn.metrics.pairwise import cosine_similarity +from dotenv import load_dotenv +import os +from openai import OpenAI , APIError +from langchain.embeddings import OpenAIEmbeddings +from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, get_source_data, get_ready_data +import tiktoken +from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens +import json + +load_dotenv() + +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +client = OpenAI() +embeddings = OpenAIEmbeddings() + +print(f"Checking for similar!") + +host = os.getenv("DB_HOST") +port = os.getenv("DB_PORT") +user = os.getenv("DB_USER") +password = os.getenv("DB_PASSWORD") +dbname = os.getenv("DB_NAME") + +def calculate_cosine_similarity(v1, v2): + v1_normalized = v1 / np.linalg.norm(v1) + v2_normalized = v2 / np.linalg.norm(v2) + + similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0] + return similarity + +def parse_embedding_string(embedding_str): + if isinstance(embedding_str, str): + numbers = [float(num) for num in embedding_str[1:-1].split(',')] + return np.array(numbers) + elif isinstance(embedding_str, np.ndarray): + return embedding_str + else: + raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.") + + +def get_titles_links_embeddings(): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;') + data = cursor.fetchall() + cursor.close() + + titles = [row[0] for row in data] + links = [row[1] for row in data] + embeddings = [parse_embedding_string(row[2]) for row in data] + + return titles, links, embeddings + +def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95): + try: + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + + with conn, conn.cursor() as cursor: + titles, links, embeddings = get_titles_links_embeddings() + + processed_articles = set() + grouped_similar_articles = [] + + for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)): + if (title1, link1) not in processed_articles: + processed_articles.add((title1, link1)) + group = [(title1, link1)] + + for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)): + if i != j and (title2, link2) not in processed_articles: + similarity = calculate_cosine_similarity(embedding1, embedding2) + + if similarity > threshold: + processed_articles.add((title2, link2)) + group.append((title2, link2)) + + grouped_similar_articles.append(group) + + return grouped_similar_articles + + except psycopg2.Error as e: + print(f"Error: {e}") + return [] + +def processing_similar(): + grouped_similar_articles_result = find_and_group_similar_articles() + + if grouped_similar_articles_result: + + for group in grouped_similar_articles_result: + articles = [] + + if len(group) > 1: + for article_tuple in group: + if len(article_tuple) >= 2: + title, link = article_tuple[:2] + article = [title, link] + articles.append(article) + l = len(articles) + if l == 2: + print("2") + a_one = articles[0][0] + a_two = articles[1][0] + + get_one = get_specific_data(a_one) + get_two = get_specific_data(a_two) + + text1 = get_one[0][1] + text2 = get_two[0][1] + link1 = get_one[0][2] + link2 = get_two[0][2] + if link1 != link2: + link = f"{link1}, {link2}" + else: + link = link1 + + ftoks = num_tokens_from_string(text1) + stoks = num_tokens_from_string(text2) + tokens = ftoks + stoks + + similar_d = f"C: {a_one}, {a_two}" + + modify_similar_data(similar_d, a_one) + preparing_articles(False, a_one) + + modify_similar_data(similar_d, a_two) + preparing_articles(False, a_two) + + print(tokens) + if tokens > 2000: + combined_text = f"{text1} {text2}" + combined_text = slice_text_at_2k_tokens(combined_text) + user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field" + else: + user_message = rf"Here are 2 texts {text1} {text2}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." + + if l == 3: + print("3") + a_one = articles[0][0] + a_two = articles[1][0] + a_three = articles[2][0] + + get_one = get_specific_data(a_one) + get_two = get_specific_data(a_two) + get_three = get_specific_data(a_three) + + text1 = get_one[0][1] + text2 = get_two[0][1] + text3 = get_three[0][1] + link1 = get_one[0][2] + link2 = get_two[0][2] + link3 = get_three[0][2] + if link1 != link2: + if link2 != link3: + link = f"{link1}, {link2}, {link3}" + else: + link = f"{link1}, {link2}" + else: + if link2 != link3: + link = f"{link1}, {link3}" + else: + link = link1 + ftoks = num_tokens_from_string(text1) + stoks = num_tokens_from_string(text2) + ttoks = num_tokens_from_string(text3) + tokens = ftoks + stoks + ttoks + + similar_d = f"C: {a_one}, {a_two}, {a_three}" + modify_similar_data(similar_d, a_one) + preparing_articles(False, a_one) + + modify_similar_data(similar_d, a_two) + preparing_articles(False, a_two) + + modify_similar_data(similar_d, a_three) + preparing_articles(False, a_three) + + print(tokens) + if tokens > 2000: + combined_text = f"{text1} {text2} {text3}" + combined_text = slice_text_at_2k_tokens(combined_text) + user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field" + else: + user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." + try: + completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "Data analytic, Journalist and News reporter"}, + {"role": "user", "content": user_message} + ]) + generated_text = completion.choices[0].message.content + + response_data = json.loads(generated_text) + title = a_one + text = response_data["content"] + vector = embeddings.embed_query(generated_text) + + insert_data(title, text, link, vector, similar_d) + print(f"Inserting combined: {title}") + + except Exception as e: + print(f"Error: {e}") + print(f"Title: {a_one}") + print(f"Answer: {generated_text}") + continue + else: + print("No similar articles found.") +if __name__=="__main__": + processing_similar() +ready = get_ready_data() +if ready: + for a in ready: + print(f"Title: {a[0]}") + print(f"Link: {a[2]}") + print(f"Status: {a[3]}") \ No newline at end of file diff --git a/pyth/scrapingsingle.py b/pyth/scrapingsingle.py index 8e65beb..44ff2eb 100644 --- a/pyth/scrapingsingle.py +++ b/pyth/scrapingsingle.py @@ -4,7 +4,7 @@ from urllib.parse import urljoin from openai import OpenAI , APIError import os from langchain.embeddings import OpenAIEmbeddings -from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data, delete_specific,get_all_links,cleansing ,modify_similar_data) +from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data,get_all_links,cleansing ,modify_similar_data) import json from dotenv import load_dotenv import tiktoken @@ -48,6 +48,19 @@ def replace_with_spaces(text): cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text) return cleaned_text + +def fix_links(links_set): + modified_links = set() + + for link in links_set: + if "www" in link: + modified_link = link.replace("www.", "") + modified_links.add(modified_link) + else: + modified_links.add(link) + + return modified_links + total_links = set() collected_news = set() @@ -78,13 +91,13 @@ for dlink in dlinks: total_links.update(temp_links) final_links = {item for item in total_links if item} -i = 0 db_links = set(get_all_links()) new_links = final_links - db_links final_links = new_links +final_links = set(final_links) - +final_links = fix_links(final_links) if __name__ == '__main__': @@ -142,6 +155,7 @@ if __name__ == '__main__': print(f"Error in completion: {e}") continue + def comb_similar(): print("Checking similar") @@ -185,12 +199,17 @@ def comb_similar(): combined_text = f"{text1}{text2}{text3}" combined_text = slice_text_at_2k_tokens(combined_text) user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with 'content' field" - link = f"{link1} {link2} {link3}" + if link1 != link2 and link1 != link3 and link2 != link3: + link = f"{link1} {link2} {link3}" + else: + link = link1 else: user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." - link = f"{link1} {link2} {link3}" - + if link1 != link2 and link1 != link3 and link2 != link3: + link = f"{link1} {link2} {link3}" + else: + link = link1 else: ftcheck = num_tokens_from_string(f_text) stcheck = num_tokens_from_string(s_text) @@ -198,12 +217,17 @@ def comb_similar(): if fscomb <2000: combined_text = f"{f_text}{s_text}" user_message = rf"Here is text {combined_text}, combined from 2 sources, filter text, and make news content, return as JSON only with 'content' field" - link = f"{link_f} {link_s}" + if link_f != link_s: + link = f"{link_f} {link_s}" + else: + link = link_f else: user_message = rf"Here are 2 texts {f_text} and {s_text}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." - link = f"{link_f} {link_s}" - + if link_f != link_s: + link = f"{link_f} {link_s}" + else: + link = link_f try: completion = client.chat.completions.create( model="gpt-3.5-turbo", @@ -213,7 +237,6 @@ def comb_similar(): ] ) generated_text = completion.choices[0].message.content - generated_text = generated_text if similar_article: if f_title == s_title: @@ -222,6 +245,7 @@ def comb_similar(): similar_article.remove(sa) print("Modified") else: + print(f"First: {f_title}") print(f"Second: {s_title}") modify_similar_data(first_t,"SOURCE") modify_similar_data(second_t,"SOURCE") @@ -243,5 +267,3 @@ def comb_similar(): except Exception as e: print(f"Error in completion: {e}") continue - -comb_similar() \ No newline at end of file diff --git a/pyth/templates/index.html b/pyth/templates/index.html index 9b156d8..c9e51c1 100644 --- a/pyth/templates/index.html +++ b/pyth/templates/index.html @@ -18,6 +18,5 @@ Second - \ No newline at end of file diff --git a/pyth/vectData.py b/pyth/vectData.py index e99883a..35a642c 100644 --- a/pyth/vectData.py +++ b/pyth/vectData.py @@ -83,6 +83,7 @@ def get_similar(): return similar_data + def insert_data(title, text, link, embedding, similar_d): conn = psycopg2.connect( host=host, @@ -97,9 +98,9 @@ def insert_data(title, text, link, embedding, similar_d): cursor = conn.cursor() cursor.execute(''' - INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time) - VALUES (%s, %s, %s, %s, %s ,%s); - ''', (title, text, link, embedding , similar_d, c_time)) + INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready) + VALUES (%s, %s, %s, %s, %s ,%s ,%s); + ''', (title, text, link, embedding , similar_d, c_time, True)) conn.commit() @@ -121,6 +122,39 @@ def get_data(): cursor.close() return data +def get_ready_data(): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''' + + cursor.execute(query, ('True',)) + data = cursor.fetchall() + cursor.close() + return data + +def get_source_data(): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''' + + cursor.execute(query, ('False',)) + data = cursor.fetchall() + cursor.close() + return data + + def modify_similar_data(new_value ,title): conn = psycopg2.connect( @@ -138,6 +172,24 @@ def modify_similar_data(new_value ,title): conn.commit() + +def preparing_articles(new_value ,title): + + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + + query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s ''' + + cursor.execute(query, (new_value, title)) + + conn.commit() + def get_specific_data(title): conn = psycopg2.connect( host=host, @@ -244,7 +296,9 @@ def create_db(conn): link VARCHAR, embedding vector(1536), similar_d VARCHAR, - time TIMESTAMP DEFAULT CURRENT_TIMESTAMP + time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + ready BOOLEAN + ); ''') diff --git a/pyth/web-server.py b/pyth/web-server.py index ae78c2b..ed1dc44 100644 --- a/pyth/web-server.py +++ b/pyth/web-server.py @@ -1,5 +1,5 @@ from flask import Flask , render_template , jsonify -from vectData import get_data +from vectData import get_ready_data from flask_cors import CORS @@ -21,4 +21,9 @@ def articleone(): def articletwo(): return render_template("two.html") +@app.route('/data/get/news', methods=['GET']) +def takenews(): + data = get_ready_data() + return jsonify(data) + app.run(debug=True) \ No newline at end of file From 96a2d888953319b0b2593dd504ad461f6073c4b5 Mon Sep 17 00:00:00 2001 From: Amir Sabani Date: Sat, 6 Jan 2024 08:26:31 +0100 Subject: [PATCH 3/5] Removing previous f. --- pyth/scrapingsingle.py | 112 ----------------------------------------- 1 file changed, 112 deletions(-) diff --git a/pyth/scrapingsingle.py b/pyth/scrapingsingle.py index 44ff2eb..ac86b52 100644 --- a/pyth/scrapingsingle.py +++ b/pyth/scrapingsingle.py @@ -155,115 +155,3 @@ if __name__ == '__main__': print(f"Error in completion: {e}") continue - -def comb_similar(): - - print("Checking similar") - similar_article = get_similar() - - grouped_data = {} - - - for sa in similar_article: - if similar_article: - first_t = get_specific_data(sa[0]) - second_t = get_specific_data(sa[1]) - link_f = first_t[0][2] - link_s = second_t[0][2] - f_text = first_t[0][1] - s_text = second_t[0][1] - f_title = first_t[0][0] - s_title = second_t[0][0] - - if f_title in grouped_data: - grouped_data[f_title].append((f_text, link_f)) - else: - grouped_data[f_title] = [(f_text, link_f)] - - if s_title in grouped_data: - grouped_data[s_title].append((s_text, link_s)) - else: - grouped_data[s_title] = [(s_text, link_s)] - - for title, tuples in grouped_data.items(): - if len(tuples) == 3: - text1, link1 = tuples[0] - text2, link2 = tuples[1] - text3, link3 = tuples[2] - - t1check = num_tokens_from_string(text1) - t2check = num_tokens_from_string(text2) - t3check = num_tokens_from_string(text3) - slice_if_more = t1check,t2check,t3check - if slice_if_more < 2000: - combined_text = f"{text1}{text2}{text3}" - combined_text = slice_text_at_2k_tokens(combined_text) - user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with 'content' field" - if link1 != link2 and link1 != link3 and link2 != link3: - link = f"{link1} {link2} {link3}" - else: - link = link1 - - else: - user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." - if link1 != link2 and link1 != link3 and link2 != link3: - link = f"{link1} {link2} {link3}" - else: - link = link1 - else: - ftcheck = num_tokens_from_string(f_text) - stcheck = num_tokens_from_string(s_text) - fscomb = ftcheck + stcheck - if fscomb <2000: - combined_text = f"{f_text}{s_text}" - user_message = rf"Here is text {combined_text}, combined from 2 sources, filter text, and make news content, return as JSON only with 'content' field" - if link_f != link_s: - link = f"{link_f} {link_s}" - else: - link = link_f - - else: - user_message = rf"Here are 2 texts {f_text} and {s_text}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." - if link_f != link_s: - link = f"{link_f} {link_s}" - else: - link = link_f - try: - completion = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[ - {"role": "system", "content": "Data analytic, Journalist and News reporter"}, - {"role": "user", "content": user_message} - ] - ) - generated_text = completion.choices[0].message.content - - if similar_article: - if f_title == s_title: - print(f_title) - modify_similar_data(first_t,"SOURCE") - similar_article.remove(sa) - print("Modified") - else: - print(f"First: {f_title}") - print(f"Second: {s_title}") - modify_similar_data(first_t,"SOURCE") - modify_similar_data(second_t,"SOURCE") - similar_article.remove(sa) - print("Modified") - else: - print("Similar list is empty") - - response_data = json.loads(generated_text) - title = f_title - text = response_data["content"] - - vector = embeddings.embed_query(generated_text) - - if not is_similar_data(title, text, link, vector, threshold=0.98): - similar_d = "NO" - insert_data(title, text, link, vector, similar_d) - - except Exception as e: - print(f"Error in completion: {e}") - continue From b7a0e5478c4999da1aa5f6966f09cf4159c1e51b Mon Sep 17 00:00:00 2001 From: Amir Sabani Date: Sun, 7 Jan 2024 03:41:32 +0100 Subject: [PATCH 4/5] organizing code --- .../scrapingsingle.cpython-310.pyc | Bin 6614 -> 4361 bytes pyth/__pycache__/vectData.cpython-310.pyc | Bin 6045 -> 6449 bytes pyth/articles.py | 173 +++++++++--------- pyth/scrapingsingle.py | 16 +- pyth/vectData.py | 168 +++-------------- 5 files changed, 122 insertions(+), 235 deletions(-) diff --git a/pyth/__pycache__/scrapingsingle.cpython-310.pyc b/pyth/__pycache__/scrapingsingle.cpython-310.pyc index 34597ac35e738ff6dec6066b9f60a885ea55899e..2598ed55e69e99b6c90bdb13b4eeebcdb0e7a35c 100644 GIT binary patch delta 1035 zcmYLH-ES0C6rXc;pGBzw>=_&hOs8PJex( z6c>xOq0aZtef+6wzw`z?*&q4^#`pjEvEySLsv4jDEcT!!x=8n(>E_wyIqJ|pdXn~k zN9X`Oh3`Eu=parhtUS!=Xr0iv?~&RJ&NR>(PEmph()a81difGIzr#Mm)iqAMMrCw(821Jb^L`cUE(Tk9}32r51y|3I#Mv}&a zl5r)cS-<$bu%bHOboz^-O25#Z-3D6~*PPzjYsz8QHLNJ85H_G~y+M2Cbns)^T~!ck zGb4wX_L6uM09fVKNg$2|KQD=+iKw;f#h+ zk?(%XT-N4-80)@#XHuIp8mbyneipSgt3aAbkc4XAA??_ylRmr5>Br}4Wu@AYtl#hw zS$MUpn#*VCB`EDzLD}Iq;=o5PJKAK{G38N~>KGwY)$M7)V74GVp>FN0&|9cYubbh@iPByU;2PNr(?9?x{! zRqZ$)RWmD@3x`0WmLN+u;D}Q=pukETxWOR;gb;85sgE2$IYk1YMJp6|uV%)q!g69p z^}ScGUcG+x>eZ|I`p!R0W!+3Bslo5#vk%y_z4x<2Pe~y_O3?t`3u)CtUPVeHL|ABNjmj4ZT=$F=d|%7AN@(6h>+(&&iHY3H$)~l zneO{2G|rL22=u~qxyDKkqeYM1@pu75d_0&o_mcy`qUn%5!5>XYjs|ZheoKxAdy)r- z)hx4OJG5A;)p!Jr7^UT9=1@Kv@Z?Np8bz_1?JNm?B=}kKI=L7Or)J1l@Lp=y=L4V`nsp{nOYo~Wy$JA^8{ho--(S1&;^Y$&CcH(?#G+fHp?>M| zCw;S=yWFxJtL)a7Ys|AaGPs)TO zw>5rQ5}7Dyn-Ljr%2-8z3|={L8ApDB;NWphCY36MUQ(%=(%8~eywXg9CfSTW*5Kg% z;)d?0{InO7=?y|-OFI9nOrx_P&-hv3!8zb_eja#O4Dih^zuPlp23FhyGzYX7Xcy2v z*&};pUnS!A%YJGsY5dnR^H7sHXkBnXBL09Emjj?VK!L*PH9&bk`AuaJWVE4-sT{!`Zjc;No_MPRn|aG_v4te7q|hLz%?mdjSkA>R&Q&jmxFZt zw2dYnh>npca0NBJYqML9JSP6AO!z4|O1DAwT3*j)uafTY`eaI4vLJWR?Q}?0*q_j0 zf2Y^qk;|R!0-K6z-T)NX-x1BYU}8wM*j37 zD=*p3f+g&_T`O@;;6OiDfHmql)<-@B(8!!;oLRQ8u+mzMg$XPUw`{1rR)xEDYswPt zDlaoJVO8v!$2cf9?|2hd$)Q%gw8SijtqGXqc+3GYXWlAztdg)U%w4`@xlV1}TC=@H zYpks@26kADsv6qpQeQ8D2UE%cVKrjPa^t&Oj#%Czvnp<_=C0w2z|T{jfGXy>Rtdhl z$b@~n?FDCb7xAUwOt^t>nRKK>d~re&9MqP|J)r|CehA{yPejpAm)sHwaCL z!lwc-*$x2DcDln+tE+YM65jVoZVS zn`>>mRkMYMo6G9U-nzOUco|JmfRns}>;k~}U>IBG5K^8%5x!OU3_uvO;pP#%4+Yy1 z@PY`9a?NH=@T1&tGhSYFZOD!=-oA=jAIT`S)^>!^s&E~+dfgHgVUpEvF-l=4MVL~2 zab=b9b#9{hIL=GV-YK(Xyq(5(hq-nJ7fal;;Z+c(7r48+3{L}E@=DE67o{*>!3Y$+ z&=e{Hz!`Ya7b|TlLZhOd5^$NeXdL>s2j=1+Kt|o5q2aB<#VEoUX33$}a{&nUCR;rq z*qdtgnp0uiJ5wa{F;P#9(74UYp3B2TCw?>}?3ITHJhQ-d zv90kv@IX8TFm;1O^<=~#=~w=y-b=E23-daRE-3|5H;GA-(DKmIWRUa|Lm!N^XlE3S zB|*c`yCW^CgElmnt-my0X5)sAHj?U})w695WO`DMlHf(Ye?)&2{TKfboE#_Wpp1Vx zjNB53IaW3PMlh1~V*EDn!GrARQSPGj4nSzo*30iCb|S*#VFc5CE2sGZR6UAtF!)Dy zhj9_vGr?%?;uvlXKY=idpk8oikvjzdDTC=F_(X6&w_~2;JCLi#+zN6W;adm-!9&1j z8vi!J8o~nv^?qAN?gNDHAT$sjBH)69xANCWckm=Xakw{vQPweT5S#vX^kBunn4<{7-{yOwq{L?}3HAktbB~_)e?r1f=q|375 z7_~?#BFm~1Eyd(JX30bzapJW^DIxo!&QNWrG*nBLlB&WJdkRu>f0)Ids-=`HnJA6O zQXHjGSxTUEN|uID8so~t(>%pT_$WWc$5`^qNa+k4W~tAW(o1ZFjUqqG-eP0y^yg~n z96Q5aLd!Ti%g!O6VB>58`OEBOb{_e8HpyN=KFMBXuOWYhS?mJxSJ~_ABJ$VR8|)Ht zi(O_{kY8X^ERFni_9pul@{26Pt|EVfO|u!~m-r=;im3%9eeD_PG?!L=-F@sJ8xee? z>Uvz1w|K>C2sE9lG~8;Pm)&Zu>e!;{ZKH9>Y1pjH8Xm82r8VDVw#U6{jr(y*8FxI} zk1f7ezOzy&%EG;sd{9^|Wb^(Iy6(*v3LmcI7iD*DemR?t_{L_#^?ak*5J+1t7rxoF zUH3^tFh9!H>UND+H2UPuFM`~_vw+9FiKMM~%0qQuYb#IH^UAK)Ry%c7)^$&BYa41? z-%$57rcOW$6Uwg9HlPbFCjccSZTMQf=|`Pv-F7y7quvlTL2Kaiw5@A+uKUAV*UHpx zJJlV|P`%z;O&n_KKoKwF)o*UhY&L3q#&+#ByI!4HnoU}t zX>NO)Go<=O+q0*e+y42A?NnM0)LTBX4>YxFA{kXj)w8Ohib+%|Bt`fmWpx#F2NjQ0 z1mVRvQb|dWwHW}NhDk$k_ ziMFGQ$|ozkv386B0=uSX>_<@6CtyGOQH+SAW}qkG#oF<vCnAH+sQq!tQTjIw(^z2qHXgllp!S=UgG6HVZ6f10Ah8yJ;n?i#vu)lS_3lkq4mNv@ZHuZl$<0SpXARHX17qw;Jn~ zOLe!=5*2>oc6!J+Dvf&Gk5*d3Z3sWcKjW2_$9;31dzDSwaU^KO6jt|R>#%O{s5D-G z2oS?!W%?99(geIcKT>fTE>92p5hFW)BT4ATyAp$XHwAY$8x9lKY2wrTGgt$e zNq3Cvj~{I;d#$F!{ZzLnSY@nR@4gNkhf~txVuF^5cj&In2H`wXbwjTq`xX)t zC^ms%QyoJt%fR%gdPYqVw$-TYi>lf`QYjNHVi7M@@N|DVO7sxKpTCKu191@guG$8d z5qkkjiiA1b!M;I2YlDP9^gG2q&~ybj9xv91ce44cbvOzLu$FSx)Ro-Im5gKiQoNyF941p$+bZYRlqXY5AmuP)}kB zt$o@c;}5mr-bW?W25F*W@PQ^sH&PQ!V>;MB;Cqc3JvE(0dTNF=YGYl8=AyZs_xm(w z5ezbiWcoq{3U0G)>w8vFwD|0a&f|Hn^JuF~nNR}fai9ivOJ46l6R#rkbC zKIi-k6tqlhj3Nn*G^M3rq6tmp(b(s6$lp-JP6p}TLK6BMPYXTGzQF*$j+Zdn5vHxj zo@($5+I~bDXOuh+d;xTn)^j^QPgtGbQVu~Q*<5ku2%B3|Qq+t^q?j25WaiLpF#@4! z#*#2hcjvx8e|I%ou%<4#8LR)XGM8L>(z9}KF&z_82!?{y&)GJ zGBeX?+mGdDJ@?H3!}M-zz#zvu^$tmg9COK?MVx66)krY~vEra;DH2V-AqA{q$a4y_ zh-{POpvd+75Dg~^C_X|juqr;L9D(g^grP53afDV3!UVZw3$KX>lzfV$?`T6hB)*^x zeolcxM~~q1T^wTI;dC;lb*W4AA)M=&4uCMXdT(*Qm_2GNg>3QABEZE!c;MZn5d{qg z#sInnG1`IQ$Lst_8F4gpDgjPWMgW`v%$%+@Sap3n^kP!fACnw(SRt*BLcfBvdZNgp zj9$X@jTeb>xZOv^5I@2U!lq>4^TwKjBhseGs1w4uU(hf@xZp_oh?_Pd1hhMiN@pWf zYy`p#Yy`rKbyY%Im0YKzH3&DzX+nkveRkQ$RNY9R$BTA2>Ugs~B51(=#=w^N!)}vT zs_WHCxrsCNpbAb&AciBzynhPGx}ioYME?e915fv-qeRaF9+%>qqvlP4S0}uXu#`ti z=%^lTH?B_{nD$Xi0K;XfQGy0*PE+%&*+Diu3Q?&00&ht?Wljg<|0d z!_OiP?2k<(dk1o9CrguHqR}jS_L{^0K;K8P90@~*4hYjgwNDro4P9XfK_L&|v$4U% z0iNz5sFjXhgUhFukOPz;;tM0aOAGldxvT6)#cZyyw2~tM>6Fzcq@BZ}-MFpuN>?98 zh94gIt*P!?7Jj$}TB~lrO;>Hlx|d&Cp3i@5eUSY)(>E%xnEUhjg*)^4fi6Uv*0DX3 z%#Q8p2H>3?o4WR$x8F(k&)f9{$4^dyqg7m5&K8RE%lE9s?Cts0yG3hZHJ{Jsisf!k z|D=J7d~aptZgxJ`m9P`jrlW&kp*`(EjCx$8>c=aX6=cBH{)pinb|a+VNdhfRwBr0i z&N6)&SGewWrP18DF31A>Q_YRwMrOL(PK)~E?jsnKs8828ZuKhebknYiCsphd@uDBW zomq1`xNOrJF0O;+<+Gp|DKKoZX+H|`#htXEJ1~)=WP}nr#E4T!{21NL;DR*}e3IJb zIct$hbh4Ib8yRp53T|ZpD8n-uu*mp8kXw`-E?G~Sx-=0fc@K3;%8)mj<{V<8mcwt6 z;kY>@rjmqZ1#xB)5m8i2XbEVL{)VQh;XnE>(=?-rxH*Y@*i4vln+a literal 6045 zcmb7ITXWmS6~>Juc-6&px#+zeLv4qPvk~Hv~;v4 zL@SNfq-bT(ni8!nTGN*F#WAa3O<0rGlr_zAj}wg)ZFQGocj;Dfxt`sznx5UZ9h2Ms4mz`rYqBPDeXFxwR6?DZzUA9(D@;?#EXOy))Y6US?MkgK z8mpCR)VNnGSHmoZRu^lvk1N$BF}$*PxBN5_YMZX-hg#R=D7QR|hkDoayoWAlVUn$N z%(m51Xvw`_0=0(YCJyf%6oKSRU&#APAU%;!N_$Em_s(T;uKH@AY{-GSAwN=>JPQ;J~7hmyJU-J`wlF5$~dx;>ipL!zgC4=O?&Xgx$30i6Nq=M9v^!e(Z9_UO3Vrf6K zpG8}n1E!JNZHE;xj?(GaXV0g?0uF0DoF3;{X!rGGJ4b?{|Nf? zCs;C&zL8ieNPi=Js}VENr;EPyx2j-jW-pI*Og@47z}sw%X5?sOxc9+q|H^NbgQF}D z>lRjc9&f#SrB=REzF9YX+jp#z)n2n0vpX9lnESqQt6I5hM8qGM=Bl8@cf^hFh>3{>AjK(LZn51HgifI&KvbDuMmxn3qORKfzTcN&g z`K?XUariMBPnevWJ7H=a#DPQwFT$)X7S^O?hlwt?JARmGIj(0F3t>V45^BEnC8S4a zgy|R^sCtuI-lprYP@~21^YoNs)|c=Vc+qA&(+f`=?ri#7UB?Ow@ksQn>G(XpJj};^ zv;_VNJtp1LQEB`7KXoSCoTeNV*Khl3+J0a)EUy@I7gs75OUA`o`C{>DmJ<;C92G}V zNLY|xpee6Y@g|C*#xGM_7&q}?%i=p>E`A(XGm3w9!}9wExP>b{vezi~T8ub<7sEqd ziJTWjcHxz(XpF%%;ivX>;gu47U$mcL@bEo9q6JB&uBV;qi zUuTToZ+QhVh@h)i4yoOki#&rtLMR!;!jcj5vot`z1Q*h_N(Q$~wo{Dn{n6r`d*zxj zcfl(e!-rA2;L%aMDkj&9`4=%$5}<@?$9*_37}W2!+qNIsOy?y`;g_hGr(%JMpHV@^ z#N;L|G`gH3#y~&3h|`a8c%-ILyXk6DnN+p^>WQ2pW*(^s{x&X;Ielq}(*nllIai9> zPY)1}Uqwf#k&_ywn@)72fi|Wb#WhkoX_zQ=3;S3TBN{agSNwe=zjM zL(CEEc&?nI8SjEDSCLm~bq?7PCVY{KVS|MQLO@(0aM;6?XX7=O>*MeN<{XIwU&3f# zA-qf{a+tVfI*4FLE@DEcUIa3+ipRK&-=^Z{C=TUkvERIg(J`1Sfg=2-3cslZD*UEK z+Fn-#?h0wXr{uoZymoJOX|Y~Dtle6vpTz#r9(qnJGc{{tR+If4~cS<7HP6sRZ0l4Rel^okwaJ1KAk&LrFMOnf4VV zK~h+f9?5|7U^0>D1-P?4;EwjTLpG#bmhu`}sW*dS6OaEqxd*#3X;%7;@<4%AWRd*} zOt8;W=6@6gnGyD+uX)Hn-FEl-pXqLN>lta+AmosfHfcrP{58kLX2!cdA5V^qa>)7X zY%RKE)D-!l8~%wNHQo#vc^du?W}xUc^w#voF>}eqjWNUjW$uVSy0iqw!XJ&@J{%HZ z@drAfUS{?JUS|UPZVMa5KArzU011yH4|yD2lM6~t=ARHaIO5+BICXT70Y{3`F+a@| zraqQWNFao4^g zd^n{}oRW1;1|Q^4oLu_BgqJ>F>@*#>vk^_A3}lpgX>8NX4nBlz{1c-Q?S}mLw5}_f zLiz`?zr>s&WL0#;wEKY@2q_AIea(xcw(l%oU#eDCjr!t^J7r_}mQns>xmK?o3XE5a zKX6MhKu8^I3C2dc4C8LM>6>egr6EFiC-9;pnWU~M*mDsqpAjqv;|Mv57#506;|SN^ zR{?hj&i>ZitsiM7?mTyAgfvneTsm;k1PCgm)iu=bibs<#W5`&38?k zKeS;~a*<)8v(@hIM4ws|*Tbht@gbB`TEvChAyS7)ge!a>J$es*j^>L@@-(&RvxC?p zi|v-!!SD~LkMe4fABs#&#~0Kt@};P``$Rbjz`>llVNkoi62M r`qOSHhwmUsH3OAK|B(b$&!qJ8`jmcLpTvJkpT threshold: + processed_articles.add((title2, link2)) + group.append((title2, link2)) - if similarity > threshold: - processed_articles.add((title2, link2)) - group.append((title2, link2)) + grouped_similar_articles.append(group) - grouped_similar_articles.append(group) - - return grouped_similar_articles + return grouped_similar_articles except psycopg2.Error as e: print(f"Error: {e}") @@ -101,7 +49,6 @@ def processing_similar(): grouped_similar_articles_result = find_and_group_similar_articles() if grouped_similar_articles_result: - for group in grouped_similar_articles_result: articles = [] @@ -112,8 +59,8 @@ def processing_similar(): article = [title, link] articles.append(article) l = len(articles) + if l == 2: - print("2") a_one = articles[0][0] a_two = articles[1][0] @@ -141,7 +88,6 @@ def processing_similar(): modify_similar_data(similar_d, a_two) preparing_articles(False, a_two) - print(tokens) if tokens > 2000: combined_text = f"{text1} {text2}" combined_text = slice_text_at_2k_tokens(combined_text) @@ -150,7 +96,6 @@ def processing_similar(): user_message = rf"Here are 2 texts {text1} {text2}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." if l == 3: - print("3") a_one = articles[0][0] a_two = articles[1][0] a_three = articles[2][0] @@ -190,13 +135,82 @@ def processing_similar(): modify_similar_data(similar_d, a_three) preparing_articles(False, a_three) - print(tokens) if tokens > 2000: combined_text = f"{text1} {text2} {text3}" combined_text = slice_text_at_2k_tokens(combined_text) user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field" else: user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." + if l == 4: + print("4") + a_one = articles[0][0] + a_two = articles[1][0] + a_three = articles[2][0] + a_four = articles[3][0] + + get_one = get_specific_data(a_one) + get_two = get_specific_data(a_two) + get_three = get_specific_data(a_three) + get_four = get_specific_data(a_four) + + text1 = get_one[0][1] + text2 = get_two[0][1] + text3 = get_three[0][1] + text4 = get_four[0][1] + link1 = get_one[0][2] + link2 = get_two[0][2] + link3 = get_three[0][2] + link4 = get_four[0][2] + + if link1 != link2: + if link2 != link3: + if link3 != link4: + link = f"{link1}, {link2}, {link3}, {link4}" + else: + link = f"{link1}, {link2}, {link3}" + else: + if link3 != link4: + link = f"{link1}, {link2}, {link4}" + else: + link = f"{link1}, {link2}" + else: + if link2 != link3: + if link3 != link4: + link = f"{link1}, {link3}, {link4}" + else: + link = f"{link1}, {link3}" + else: + if link3 != link4: + link = f"{link1}, {link4}" + else: + link = link1 + + ftoks = num_tokens_from_string(text1) + stoks = num_tokens_from_string(text2) + ttoks = num_tokens_from_string(text3) + frtoks = num_tokens_from_string(text4) + + tokens = ftoks + stoks + ttoks + frtoks + + similar_d = f"C: {a_one}, {a_two}, {a_three}, {a_four}" + modify_similar_data(similar_d, a_one) + preparing_articles(False, a_one) + + modify_similar_data(similar_d, a_two) + preparing_articles(False, a_two) + + modify_similar_data(similar_d, a_three) + preparing_articles(False, a_three) + + modify_similar_data(similar_d, a_four) + preparing_articles(False, a_four) + + if tokens > 2000: + combined_text = f"{text1} {text2} {text3} {text4}" + combined_text = slice_text_at_2k_tokens(combined_text) + user_message = rf"Here is text {combined_text}, combined from 4 sources, filter text, and make news content, return as JSON only with a single 'content' field" + else: + user_message = rf"Here are 4 texts {text1} {text2} {text3} and {text4}, combine the following texts into a cohesive news, remove any non-news related to all texts, and provide the cleaned data as a JSON only with a single 'content' field." try: completion = client.chat.completions.create( model="gpt-3.5-turbo", @@ -216,16 +230,11 @@ def processing_similar(): except Exception as e: print(f"Error: {e}") - print(f"Title: {a_one}") - print(f"Answer: {generated_text}") + print(a_one) continue + else: + print("Done!.") else: print("No similar articles found.") if __name__=="__main__": processing_similar() -ready = get_ready_data() -if ready: - for a in ready: - print(f"Title: {a[0]}") - print(f"Link: {a[2]}") - print(f"Status: {a[3]}") \ No newline at end of file diff --git a/pyth/scrapingsingle.py b/pyth/scrapingsingle.py index ac86b52..e939adb 100644 --- a/pyth/scrapingsingle.py +++ b/pyth/scrapingsingle.py @@ -1,10 +1,10 @@ from bs4 import BeautifulSoup import requests from urllib.parse import urljoin -from openai import OpenAI , APIError +from openai import OpenAI import os from langchain.embeddings import OpenAIEmbeddings -from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data,get_all_links,cleansing ,modify_similar_data) +from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing ) import json from dotenv import load_dotenv import tiktoken @@ -39,7 +39,7 @@ def slice_text_at_2k_tokens(text): sliced_tokens = tokens[:max_tokens] sliced_text = encoding.decode(sliced_tokens) - + return sliced_text @@ -82,7 +82,6 @@ def get_article_links(url, already_checked): return link_store - already_checked = set() for dlink in dlinks: @@ -116,8 +115,6 @@ if __name__ == '__main__': title_text = replace_with_spaces(title_text) - - print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}") text_text = slice_text_at_2k_tokens(text_text) text_text = replace_with_spaces(str(text_text)) @@ -138,13 +135,6 @@ if __name__ == '__main__': title = response_data["title"] text = response_data["content"] - #print("*********************************") - #print(f"Title: {title}") - #print("---------------------------------") - #print(f"Content : {text}") - #print("*********************************") - - vector = embeddings.embed_query(generated_text) if not is_similar_data(title, text, link, vector, threshold=0.98): diff --git a/pyth/vectData.py b/pyth/vectData.py index 35a642c..e3deda7 100644 --- a/pyth/vectData.py +++ b/pyth/vectData.py @@ -7,7 +7,6 @@ import os from dotenv import load_dotenv from datetime import datetime ,timedelta - load_dotenv() host = os.getenv("DB_HOST") @@ -27,20 +26,20 @@ conn = psycopg2.connect( def calculate_cosine_similarity(v1, v2): v1_normalized = v1 / np.linalg.norm(v1) v2_normalized = v2 / np.linalg.norm(v2) - similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0] return similarity -def is_similar_data(title, text, link, embedding, threshold=0.98): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) - cursor = conn.cursor() +def parse_embedding_string(embedding_str): + if isinstance(embedding_str, str): + numbers = [float(num) for num in embedding_str[1:-1].split(',')] + return np.array(numbers) + elif isinstance(embedding_str, np.ndarray): + return embedding_str + else: + raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.") +def is_similar_data(title, text, link, embedding, threshold=0.98): + cursor = conn.cursor() cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;') existing_embeddings = cursor.fetchall() @@ -54,12 +53,12 @@ def is_similar_data(title, text, link, embedding, threshold=0.98): similar_d = existing_title insert_data(title,text,link,embedding,similar_d) print(f"Similar data found: \n #{title} \n #{existing_title}") - print(f"Inserting: #{title} \n") + print(f"Inserting: #{title}") similar_d = "NO" cursor.close() return True else: - print(f"Same source of same article!") + print(f"Same article of same source!") cursor.close() return True @@ -68,13 +67,6 @@ def is_similar_data(title, text, link, embedding, threshold=0.98): return False def get_similar(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')''' cursor.execute(query) @@ -82,73 +74,49 @@ def get_similar(): cursor.close() return similar_data +def get_titles_links_embeddings(): + cursor = conn.cursor() + cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;') + data = cursor.fetchall() + cursor.close() + + titles = [row[0] for row in data] + links = [row[1] for row in data] + embeddings = [parse_embedding_string(row[2]) for row in data] + + return titles, links, embeddings def insert_data(title, text, link, embedding, similar_d): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) c_time = datetime.now() - - cursor = conn.cursor() - cursor.execute(''' INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready) VALUES (%s, %s, %s, %s, %s ,%s ,%s); ''', (title, text, link, embedding , similar_d, c_time, True)) - conn.commit() - cursor.close() def get_data(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) + cursor = conn.cursor() query = '''SELECT title,text,link FROM vectorsvevijesti;''' - cursor.execute(query) data = cursor.fetchall() cursor.close() return data def get_ready_data(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''' - cursor.execute(query, ('True',)) data = cursor.fetchall() cursor.close() return data def get_source_data(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''' - cursor.execute(query, ('False',)) data = cursor.fetchall() cursor.close() @@ -156,138 +124,60 @@ def get_source_data(): def modify_similar_data(new_value ,title): - - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() - query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s ''' - cursor.execute(query, (new_value, title)) - conn.commit() def preparing_articles(new_value ,title): - - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() - query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s ''' - cursor.execute(query, (new_value, title)) - conn.commit() def get_specific_data(title): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() - query = '''SELECT title, text, link, similar_d, embedding FROM vectorsvevijesti WHERE title = %s''' + query = '''SELECT title, text, link, similar_d, embedding, ready FROM vectorsvevijesti WHERE title = %s''' cursor.execute(query, (title,)) - specific_post = cursor.fetchall() cursor.close() return specific_post + def get_all_links(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() query = '''SELECT link FROM vectorsvevijesti''' cursor.execute(query) - db_links = {link[0] for link in cursor.fetchall()} cursor.close() return db_links def delete_specific(title): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) - cursor = conn.cursor() query = '''DELETE FROM vectorsvevijesti WHERE title = %s''' - cursor.execute(query,(title,)) cursor.close() def cleansing(): - - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) - day_long = datetime.now() - timedelta(days=1) - cursor = conn.cursor() - query = '''DELETE FROM vectorsvevijesti WHERE time < %s''' cursor.execute(query,(day_long,)) - conn.commit() cursor.close() def drop_table(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) - cursor = conn.cursor() - query = '''DROP TABLE IF EXISTS vectorsvevijesti;''' cursor.execute(query) - conn.commit() cursor.close() -def create_db(conn): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) +def create_db(): cursor = conn.cursor() - cursor.execute("CREATE EXTENSION IF NOT EXISTS vector") - register_vector(conn) - cursor.execute(''' CREATE TABLE IF NOT EXISTS vectorsvevijesti ( id bigserial PRIMARY KEY, @@ -298,10 +188,8 @@ def create_db(conn): similar_d VARCHAR, time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, ready BOOLEAN - ); ''') - conn.commit() cursor.close() -create_db(conn) +create_db() From 54a41046ce4b7b1a14f14150f003ac38b73d4502 Mon Sep 17 00:00:00 2001 From: Amir Sabani Date: Mon, 8 Jan 2024 00:28:20 +0100 Subject: [PATCH 5/5] Fixed response/JSON --- .../scrapingsingle.cpython-310.pyc | Bin 4361 -> 4498 bytes pyth/__pycache__/vectData.cpython-310.pyc | Bin 6449 -> 6449 bytes pyth/articles.py | 5 ++- pyth/scrapingsingle.py | 41 +++++++++--------- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/pyth/__pycache__/scrapingsingle.cpython-310.pyc b/pyth/__pycache__/scrapingsingle.cpython-310.pyc index 2598ed55e69e99b6c90bdb13b4eeebcdb0e7a35c..b39ce5c530a438802d33a09755610775c64abdd0 100644 GIT binary patch delta 2015 zcmZ8iO;8h89N)LuY&Kt;009!@BM>18C_=yhiXe&~lv~C4Ho|tqh}cmhYU_r+ z=NmI(=o_%&c8k%nr)xD@QKwmLcEU&y30k%#MBT%{ZaWP=Aw~zN+mg`bq-mO@DWv@{fT=Pb z?x$UJAL<4@X$Vq_(#N`yg+a)|(6(&kAjj@QW;YMHCxl2>Kc(HYXGhwC%%ZeSjA0mn z=(gZSU>HV{l5qeIc-mpl+7C^7koIoK_Xz_S#sc3Oqag1df>G#tA{%2sV5~Y$`>X_z*qp@@!YC}I;u z=!I#d(L zc!y5H(nIO7yp7-Gl5`oz(O9~&boF_Y-XJ$n^MQBrMgBt0^buD*zX`R4r95%tP41Mv z0+gDyij!CPsIQOPzEm%o!&bdaOSImA+I1l^*esi@RCVgLZ@%jQOY+D5Zt{|U=f9APqSE{#AW6^sERH;D5zr=}6~R?0 zU@myZ3u=b6^N-YAxI-8{=;U|Q?w)RxUkYc1C?teWgs6BzD6xzvb%CYOn)kiOM72LD z0_zeWBAc0-=}>gr9m`yWQp0RmP%_s_H2?}UD?IlL7((Flk#_E?Ka&prRiMB9O>l6w z0yVg~$>t{ci@^KMR!2lwC!g1nEd^v96+t0M(cTze*J>oof7g}^Z@m^exWP5^X2~*Z z8_tvn68EbT4WEd3WDK6+7lS{MIX)PAmrQaNI+2?dfqpNDi6}QD_|o;Vbq(HrPyR>f zI623A!o`)>Rr4~-ix9_!PzGlmXsp)l%{c=_M@C3U?7T2(rIKAXYo!u9ield9 z%IgI@<9H@m4pn$rDgI-4dcsTZ_b>`9rgo=`;(N81hmSNJ`A$Gm-R>izu)U1zbYd z>yz~e#p~`_rw~PRmSCR-(q_X+}tJ+Gv3IiJ>4i)OJFWnv%zu76>!*Pk-x-$i%zwQY059Vz7=PZiH_Mk z@6|I~u3UHAgjKFpR@d=nM&b2NF~67VI;=Jp%Z)MI~Eq6Y6ZkPvWO?FS|;}GhJb&sx8@gw&gV3k>oaQ&CiahI1c>ovV*+Lb z6!_nIzj{uvr+Hs&IV;A79T#v$fOpGh1$Pp`RlTq0uu=XXHlX+#jSc=hc7&ApU$J3w znJ42jTW>yz$BFOaYP0ymaLs}_f|BzHAtE&gSzyu|YQd?pN7{hGiwNN zXd(qw>H!3-Pz97KL29K|s)X8mrJj4?(#!q_m3r$P_02j^nrMGB@AuyC&70l#${Y7fVNFo3c`q; zCBipDFDdKdVIDCl>*iBD20h$>-gTA7dE#D>?dQ$Bazz3;9G?c@nM*MPWT995i>7>k}WCx;C+Q1 zgJb1IPEw@&Vfi>)f+e=J39UOPUOB*J_)(!>u=6?J76d@4OB1MR-ue zsup;jSL;x@_3N`hji#D$ZoD8ygXQT^DWa@W`jLo~n+{%%ccRx(5yI4?52+U zQ}EnK0!j0A6Cs(PXvMuqN@$nRhTv%&Fc)Op0WD2C+*8^}Bqg0bbh>x6?w($>ofZU@ zsx+*GX;jhh2~$-OX}7ECFPd_LdVgC(B9n#|N>G8@HL*D99@Wp!_?{7&Ru<~D3aD9y zBIGyB`n9}M!>u!p4o5>!X$Kx7#@yfa3LSJOL#xy86(r@Mb@O)KGAq}e87VRlG^#5) zjmm!otD19v4!x#F-7mvu>6H6N_{2z73WL5#NR=0s`~0nfbrTwQZB*~hMve#05aRyU z)b2ir^yc54ol(S)6sl$&?Dr*-3@tcNFIMf^;uSO<1tCeqS?SXAdAnd%@_BIt%}l^k zu1(`Jh0kGez>P#NWc|Fsx>>jWSBq4K7cA3hWVX=$M`rFI;*#{ynrlQ;bK(;;#CZu9 z5ccX6J(74=z5n8USyl3%6Zh}vQ~JdHDz>H;oW8IK%E86hhZ3+@vmy(xlHp;R3&X%#A^;x)Yg zIJ_GgAwRRH--2R&8HYw-a}uFT2uYAX5h5ZXDj|m8sb!~H!Cv(TLdY-fH?ybVMB#{t zxD3*YmWic%?rc1{x+s}|Qx{&qtkgx9l%yq$Ntlx`f#5~z)q25dw3BBPtCodj0nb-p z!x7jBM4WV=##e`B(}-gdmL>RacSY`wBY2ws@f int: encoding = tiktoken.encoding_for_model(model) return len(encoding.encode(string)) def slice_text_at_2k_tokens(text): encoding_name = "gpt-3.5-turbo" - max_tokens = 2000 - + max_tokens = 1950 encoding = tiktoken.encoding_for_model(encoding_name) tokens = encoding.encode(text) - if len(tokens) <= max_tokens: return [text] - sliced_tokens = tokens[:max_tokens] sliced_text = encoding.decode(sliced_tokens) - return sliced_text +def slice_title_if_needed(text): + encoding_name = "gpt-3.5-turbo" + max_tokens = 100 + encoding = tiktoken.encoding_for_model(encoding_name) + tokens = encoding.encode(text) + if len(tokens) <= max_tokens: + return [text] + sliced_tokens = tokens[:max_tokens] + sliced_text = encoding.decode(sliced_tokens) + return sliced_text def replace_with_spaces(text): allowed_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐ𩹮ž0123456789 " cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text) return cleaned_text - def fix_links(links_set): modified_links = set() - for link in links_set: if "www" in link: modified_link = link.replace("www.", "") modified_links.add(modified_link) else: modified_links.add(link) - return modified_links total_links = set() collected_news = set() - def get_article_links(url, already_checked): response = requests.get(url,headers) if response.status_code == 200: @@ -81,25 +81,22 @@ def get_article_links(url, already_checked): already_checked.add(link_value) return link_store - already_checked = set() for dlink in dlinks: temp_links = get_article_links(dlink, already_checked) if temp_links: total_links.update(temp_links) - final_links = {item for item in total_links if item} db_links = set(get_all_links()) new_links = final_links - db_links final_links = new_links final_links = set(final_links) - final_links = fix_links(final_links) if __name__ == '__main__': - + for link in final_links: response = requests.get(link,headers) soup = BeautifulSoup(response.text, 'html.parser') @@ -117,24 +114,26 @@ if __name__ == '__main__': text_text = slice_text_at_2k_tokens(text_text) text_text = replace_with_spaces(str(text_text)) - + + ttk = num_tokens_from_string(text_text) + + if ttk > 1900: + title_text = slice_title_if_needed(title_text) try: completion = client.chat.completions.create( model="gpt-3.5-turbo", messages=[ {"role": "system", "content": "Data analytic, Journalist and News reporter"}, - {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."} + {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data make sure that its valid JSON object with 'title' field and 'content' field."} ] ) generated_text = completion.choices[0].message.content - generated_text = generated_text + generated_text = repair_json(generated_text) response_data = json.loads(generated_text) - title = response_data["title"] text = response_data["content"] - vector = embeddings.embed_query(generated_text) if not is_similar_data(title, text, link, vector, threshold=0.98):