From ae1c1902dae5dea7eddbbce8f61457cf68d29472 Mon Sep 17 00:00:00 2001 From: Amir Sabani Date: Tue, 2 Jan 2024 15:00:07 +0100 Subject: [PATCH] Combine similar article --- pyth/.env | 7 + pyth/.gitlab-ci.yml | 21 ++ .../scrapingsingle.cpython-310.pyc | Bin 0 -> 3031 bytes pyth/__pycache__/vectData.cpython-310.pyc | Bin 0 -> 5433 bytes pyth/requirements.txt | 141 +++++++++++++ pyth/scrapingsingle.py | 186 +++++++++++++++-- pyth/templates/index.html | 23 +++ pyth/templates/one.html | 12 ++ pyth/templates/two.html | 12 ++ .../test_scrapingsingle.cpython-310.pyc | Bin 0 -> 2416 bytes .../__pycache__/test_vectData.cpython-310.pyc | Bin 0 -> 2808 bytes pyth/tests/test_scrapingsingle.py | 60 ++++++ pyth/tests/test_vectData.py | 89 ++++++++ pyth/vectData.py | 190 +++++++++++++++--- pyth/web-server.py | 24 +++ 15 files changed, 726 insertions(+), 39 deletions(-) create mode 100644 pyth/.env create mode 100644 pyth/.gitlab-ci.yml create mode 100644 pyth/__pycache__/scrapingsingle.cpython-310.pyc create mode 100644 pyth/__pycache__/vectData.cpython-310.pyc create mode 100644 pyth/requirements.txt create mode 100644 pyth/templates/index.html create mode 100644 pyth/templates/one.html create mode 100644 pyth/templates/two.html create mode 100644 pyth/tests/__pycache__/test_scrapingsingle.cpython-310.pyc create mode 100644 pyth/tests/__pycache__/test_vectData.cpython-310.pyc create mode 100644 pyth/tests/test_scrapingsingle.py create mode 100644 pyth/tests/test_vectData.py create mode 100644 pyth/web-server.py diff --git a/pyth/.env b/pyth/.env new file mode 100644 index 0000000..c213e8f --- /dev/null +++ b/pyth/.env @@ -0,0 +1,7 @@ +OPENAI_API_KEY = "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" + +DB_HOST =localhost +DB_PORT =5432 +DB_USER =postgres +DB_PASSWORD =salmonela pljusti 221 hamo +DB_NAME =svevijestiweb \ No newline at end of file diff --git a/pyth/.gitlab-ci.yml b/pyth/.gitlab-ci.yml new file mode 100644 index 0000000..8cd8989 --- /dev/null +++ b/pyth/.gitlab-ci.yml @@ -0,0 +1,21 @@ +stages: + - test + +variables: + +before_script: + - pip install -r requirements.txt + +test_file1: + stage: test + script: + - python -m pytest tests/test_scrapingsingle.py + only: + - master + +test_file2: + stage: test + script: + - python -m pytest tests/test_vectData.py + only: + - master diff --git a/pyth/__pycache__/scrapingsingle.cpython-310.pyc b/pyth/__pycache__/scrapingsingle.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38af9dbd593bcd53feec35a1f82ca687699f9f61 GIT binary patch literal 3031 zcmai0TW=f372aJgmn%x7BuBo;*O6^mqAiiOWLb`^xKV7ub$mtT#_<*|vEm$&OU+(1 zGn6fY0u0D6L1Dlt(5Ey2oj3oJKD0le^V+8XeQTeB!swYLWg97qEHSq;XU?2C^PO)P z7Yca;zi&!^fUkxO;~#qP@i&Nr&#{QRmSIqXn#>4I&op)4VwPuNn_-#I_Uu$zxLxdj?L! z;Ww6d7ECzXoTLZP&N(>OokBl8`N2f*rs-I>L=VzKXlYtoLfLt(K@Zn;|3W%{B%Qy| zpPy;}9>$up&2qCs$LUdej2{29MNiO^aQ-`ko`M;y$@f_;ty^^BmE~Q8i_J^a7&n@8 z&3W%K40jjcGFra^SLo?+!&}7m47QVSm7d+h(_uRai{BaFTm8|wemez|A4z+<2fLr9 z)|Q|Mi!h02UZT^q^eX$>6b=;jWqWwnkBzOiZNRnH=FWs^c*~~IF5q70VR_%`W8(>8 z#9RK|@>aXoJ~N-8WGyYF3qNX{l}uIm0{oT?Hd<^gZYB3ol526+jDrZf&VvM^mD|`G z>Gf8_b)Yne>H=l%!&_fLO~$-rslgx;z-5*C(pSTQsEQyAn9oxI>JGBlr&StDh_+Sn z!Nb-2E4QmF4{ulRtUf7a+QSVg6R}jOY_i~adEIXx+7}3(h)q9;HsW$E4%=h<@^#h( z;cu4pOgsNu+T zxvO83FIoYkmAm(zTweUTL{^f7!DCpz6UfTK{9<|j3Yot1>qqzQ&JY%C0=WgX&A3Ev zG?rVtH<1?n?RUJh>OI2MoxXzv1(szqp1;`~K63?qIorMZCmH8eOwvNb8NY zYjvz$#&fiCa-C9%q~jPgqz$t+Q}jej5VX=-v9A0=*v%o z4H>d>;&Y6n%K2OdH3rJ`AK}&64GtT87}ZL7<#O0+fsjIF>L69F0X_vT)S!^QY>8?u z#-Tp9vL!s1%H0SeTJ;&@87*}D1QA75#-|jmWe{p=*k>HA>{M$Fj4x2Vhe%??ld=V3 zQn`J9gvzrZ+N=tM9Pom+m(>EHY~A!{w|&-vw+5fU`Jb*=8v3ey;jjBqP`L?WQ^rX} zY{Pbd_>sYE=nDzFlI+MvMbx;TXmDd;P)>H#C?2xf-@cmebrj7Cg4K|jH+8X1*D9K> zsY@Pz)(h)DcGmtkKZ8m&6RZ7Chebyrg}@kg8b-%>>g<{wlUp70kJc7uXXipgde2*G%i#F%K{|kLvsBR4u!TnX7F=*%B@yI%$I7F7 zJj$Y$_UKMai=x!k`}*p0$^Du{MF!h`B#9;-9{MtfBeH>nKx6~RM$B0J4B>;*NlPw~M>;M`r0vd- zM;I^c{i#fT17W-kxcLr=;%GL~4xrf(x6}5036#hm4PvZ3^GBkM>$Of58mrkWir~0RxA{zlPD$027Y>2tblpBH& zejNnY(Bg|auoLHh>1*fqPdYLCOJ6&GqZeDUMCyP0tN#A^D}>j8M5Ymqn9=sZRnB9M zK_nXF40M#2Na-_`Sr?a6e&HkNm>NO?VZnNzxpZ@S{HR`QU`~|b z{qw~&?A56cQ`_lFmQsr-C-vTtYsOZt9%u~l91gjwWp-1;Qg9)q#Wh{NxBNkPxA$V{npjC~Clt1{m*3G61RUW~d^p%}f zf>#Y{5c&1J1x)?O4*^sE6fWbVy3%~rOPyo7cMvP(q)0&8)XzMc)fi3F8S@x%Ch#V( z=}Byi96I{p8U&sV`5O1me-L82G~%EJLgo7KNr9t^_#^|@fbQGZe*=Un>@$4&_aFtD z$|gL(fDNVrtKuWWca+n+1~styyatH|figd48O`xMLUflAy)6dUHQ|ajRzrr1{5*uL ztm~1qb*#MQSPn9_YdRU-IwP*7Ib8F05ocXXYZt9Ca|nxtdKPLOGOc&cyJAkyyC3s? VW(>LCeldbI?)1WXXaC){{tKS$N(uk~ literal 0 HcmV?d00001 diff --git a/pyth/__pycache__/vectData.cpython-310.pyc b/pyth/__pycache__/vectData.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4104298addadbf5270d81b2b0903869d4f76c93f GIT binary patch literal 5433 zcmbVQTW{OQ6()zIL|uJ}H?ebNHa81Z>?Vu0+a#N<6Um6Xv1~|6c30S~Kx@V}6G~Ku zv=bZjlk*xB*w;k>`!Nsw2L%fB7xbBLMIYLrK>N`4rA2|BGnA-{<%LrUbK%VJ_MPuL z=Zu_8CMm)3yT5$L{(44|{)8v}M;uOm0mYq^C5cK@wxt@j#Fm$RPSZ(x3P#S* zDf$t#)ATew1MLi*rZdotRCu9OR6 zy0oxde5VEKrsH~n+IBe9TQ1{)-ZowLrNe0uqiZd*$r=P-xpQdHs!$$6aqmFoOP=&h zJ|MnyD4&z|h%a}}WpS=}icdCVU)hlN36*C6g&Ap2^;N)vlsJ$?%BcZqwS$;#wM=^> zP+Jafa(qTk3&{2@D7S;m_N_XOo3^#fD4gEzKTYS z7nu9pbK3LnHruwIG1s$hHcdo*zP;mZ&Li{%(=+GVJHfezX*afPfVY0c4tQ%}P&)@v|_y3yx=I@_35S{Lkg$}AgAoVxsATA#96i@XuFGl4z+MecX2k}E` zFXqP%bV?3^CD2R2NZgMXq;FUEbYG_mAeQiw2Px>QGl26!8dfCXOoN$}m+_OnvQPHX ze%e=`tK9O5pL(wFU-^l>jFbsZ&hR_jOk)5@m3K` zP3~pjJ;o0~Wx(511y`hTW^nYyO!vySq<@xXz`EHI&%jsj-mVlM7avv)&+=@RXU#Q6 zsny!ZgSnp@i{+JNBP4#$GfKzh?yceYyej*&F3?5d(pFSUTU z9^CW}j1eOlkkU$Z_k0D&Yq-u9Z!p7IH(We5VO_&!mltyrz;t(~;j}ky2l0l}Y5@m0 z8jX)(HICH;R+CT#vBnm69UjElE7sWZSfH;nud!*`Ha~^)n#sAj6U5hnI3Q8Miy&o* z7pvoI2U?q3Eicd-w&Su~HqZnhf$FhWAU#AQNJQv>sy8`vHyxV>D!v@QicdMkUV*QG z53NTl-C+9YVBOnl+bqaNGvTu)qx0zUAQO$@EAX@Um_$cUbt=RZR_AeM%yPSKNqp7E zFzDWgN|j0Ui~UEBCy6fWP$p$vPD0UT6(lnSMTLG$%*AB#&ukWF_@{7-29)SP{dd*# z4}8D~^?>gY>X9iyzjL7bp@L+gAO%K}$Y5UqHYy!9J{@3V^qr%LXngsoSS}j9RX{6a zsbtLFD6QPc8#k6$3QLPi#lnr;yA($#_!X>OPB#hL3Snnr|&aqU27=+MOSB|LNm<>IHfkMa|$ilo4 z@iX5;KMyXX$#T(6pDsLpQmh!W*WA1@Sd9ENH+NSo%w9qiC@tq@&qSbjr`fc; z&`LVb!xDZIt2wOZvHBTSXg*k6$CnE)$0*U$49~&oQz$MfXxLJ^5+hkf{jaX2iCB58 z3i!|A@`%Sb2Y4)?`yq#lk-DF1{4NXxDmtcNg6T*woUmcZaqL2Mqi!F-nj`r^SGVpx zm>t0`;uE1;1$3+W3g}ivW~~#!D1wX|V$!`QtA&MX@hEpH#cGcr#%I8Tj%Ed?!hu}! zBosgahT8lbdXRb~TI^+g+q6NF!uF9&1$M)zooYJNTHom^Qvm40joi~vsbtVOs$`si zxeGAWz$n)}Fnx%?Luf~UCxctVlz{+vkXiwFiZ89pJ?H>Z(s6v4WOW@MWPyEQctC8C zyXn*24?4sixLgoR8--Aiaoen6ty_(H8*#jh)y@3xthHqIBFMx08@s3r9 z`)D*|$HWP$Q1vzN4*6D)_GN&xKVoQ|h;g|C-0|`nupxnWz~EJZ;!Z)e36K9awg+~j z(v0*Sc|pJ`vPf|RCXmdNxgUpKWzcg3iF+<28BVi({~g{9Z#^aL8i*Vu9(8;pckZ6; zK*s9cpNkfUM(GPIq1tu27G5%Jip;o-;}=KFMNa%b%pDT_ z%>pnML_ab|JRMQTlBC>e*FAI1W`96nrlA^29dwmIVvVq@AT0fH1RX^L z%VRhhMYwMA3;=Yto?gNSHwZ}`jwoU?DkvyMM4Bf+Qag_E+QV`YgL3h!YOz#VS}7q^ zm?00qWqty07UgW6OCn^CEbr|CjoIib%pxsntXUhd&$CS1SS>FtFOFBp;290E>@}w%d1AA zxVZ4-an*SEq+BkRs`Y3l`iySknm&mbuYz&ItOcdeCqdFju;HlE;0*uWplg4GYeu?y z*u4=bX&@Kw5FlJ##O8ANN=UC@qP-FB+vmEwXpXNCOuNrP(!4d-WFEH~?p)in_)7~6 zhi)*?T3gNbPPk_#jtl=WiGQ>>raoM_79?&EgOCJ&bREByx`OLPN{0Wyi2bA3>WCa# zq@^Og5($F{SHdj|4B+c$RUg++Lz#dwu8-+c`b7Ht{{ZQGt int: + encoding = tiktoken.encoding_for_model(model) + return len(encoding.encode(string)) + +def slice_text_at_2k_tokens(text): + encoding_name = "gpt-3.5-turbo" + max_tokens = 2000 + + encoding = tiktoken.encoding_for_model(encoding_name) + tokens = encoding.encode(text) + + if len(tokens) <= max_tokens: + return [text] + + sliced_tokens = tokens[:max_tokens] + sliced_text = encoding.decode(sliced_tokens) + + return sliced_text + + +def replace_with_spaces(text): + allowed_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐ𩹮ž0123456789 " + cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text) + return cleaned_text + total_links = set() collected_news = set() + def get_article_links(url, already_checked): response = requests.get(url,headers) if response.status_code == 200: @@ -36,6 +68,8 @@ def get_article_links(url, already_checked): already_checked.add(link_value) return link_store + + already_checked = set() for dlink in dlinks: @@ -44,8 +78,17 @@ for dlink in dlinks: total_links.update(temp_links) final_links = {item for item in total_links if item} +i = 0 -for link in final_links: +db_links = set(get_all_links()) +new_links = final_links - db_links +final_links = new_links + + + +if __name__ == '__main__': + + for link in final_links: response = requests.get(link,headers) soup = BeautifulSoup(response.text, 'html.parser') @@ -54,6 +97,16 @@ for link in final_links: texts = soup.find_all(['p']) text_text = ' '.join([text.get_text(strip=True) for text in texts]) + + text_text = text_text + title_text = title_text + + title_text = replace_with_spaces(title_text) + + + print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}") + text_text = slice_text_at_2k_tokens(text_text) + text_text = replace_with_spaces(str(text_text)) try: completion = client.chat.completions.create( @@ -65,23 +118,130 @@ for link in final_links: ) generated_text = completion.choices[0].message.content + generated_text = generated_text + response_data = json.loads(generated_text) title = response_data["title"] text = response_data["content"] - print("*********************************") - print(f"Title: {title}") - print("---------------------------------") - print(f"Content : {text}") - print("*********************************") + #print("*********************************") + #print(f"Title: {title}") + #print("---------------------------------") + #print(f"Content : {text}") + #print("*********************************") vector = embeddings.embed_query(generated_text) - - if not is_similar_data(title, text, link, vector, threshold=0.9): - insert_data(title, text, link, vector) + if not is_similar_data(title, text, link, vector, threshold=0.98): + similar_d = "NO" + insert_data(title, text, link, vector,similar_d) + except Exception as e: print(f"Error in completion: {e}") continue + +def comb_similar(): + + print("Checking similar") + similar_article = get_similar() + + grouped_data = {} + + + for sa in similar_article: + if similar_article: + first_t = get_specific_data(sa[0]) + second_t = get_specific_data(sa[1]) + link_f = first_t[0][2] + link_s = second_t[0][2] + f_text = first_t[0][1] + s_text = second_t[0][1] + f_title = first_t[0][0] + s_title = second_t[0][0] + + if f_title in grouped_data: + grouped_data[f_title].append((f_text, link_f)) + else: + grouped_data[f_title] = [(f_text, link_f)] + + if s_title in grouped_data: + grouped_data[s_title].append((s_text, link_s)) + else: + grouped_data[s_title] = [(s_text, link_s)] + + for title, tuples in grouped_data.items(): + if len(tuples) == 3: + text1, link1 = tuples[0] + text2, link2 = tuples[1] + text3, link3 = tuples[2] + + t1check = num_tokens_from_string(text1) + t2check = num_tokens_from_string(text2) + t3check = num_tokens_from_string(text3) + slice_if_more = t1check,t2check,t3check + if slice_if_more < 2000: + combined_text = f"{text1}{text2}{text3}" + combined_text = slice_text_at_2k_tokens(combined_text) + user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with 'content' field" + link = f"{link1} {link2} {link3}" + + else: + user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." + link = f"{link1} {link2} {link3}" + + else: + ftcheck = num_tokens_from_string(f_text) + stcheck = num_tokens_from_string(s_text) + fscomb = ftcheck + stcheck + if fscomb <2000: + combined_text = f"{f_text}{s_text}" + user_message = rf"Here is text {combined_text}, combined from 2 sources, filter text, and make news content, return as JSON only with 'content' field" + link = f"{link_f} {link_s}" + + else: + user_message = rf"Here are 2 texts {f_text} and {s_text}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." + link = f"{link_f} {link_s}" + + try: + completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "Data analytic, Journalist and News reporter"}, + {"role": "user", "content": user_message} + ] + ) + generated_text = completion.choices[0].message.content + generated_text = generated_text + + if similar_article: + if f_title == s_title: + print(f_title) + modify_similar_data(first_t,"SOURCE") + similar_article.remove(sa) + print("Modified") + else: + print(f"Second: {s_title}") + modify_similar_data(first_t,"SOURCE") + modify_similar_data(second_t,"SOURCE") + similar_article.remove(sa) + print("Modified") + else: + print("Similar list is empty") + + response_data = json.loads(generated_text) + title = f_title + text = response_data["content"] + + vector = embeddings.embed_query(generated_text) + + if not is_similar_data(title, text, link, vector, threshold=0.98): + similar_d = "NO" + insert_data(title, text, link, vector, similar_d) + + except Exception as e: + print(f"Error in completion: {e}") + continue + +comb_similar() \ No newline at end of file diff --git a/pyth/templates/index.html b/pyth/templates/index.html new file mode 100644 index 0000000..9b156d8 --- /dev/null +++ b/pyth/templates/index.html @@ -0,0 +1,23 @@ + + + + + + Test Pyth + + +
+
+

Test Title 1

+

Test Text 1

+ First +
+
+

Test Title 2

+

Test Text 2

+ Second +
+
+ + + \ No newline at end of file diff --git a/pyth/templates/one.html b/pyth/templates/one.html new file mode 100644 index 0000000..bcba718 --- /dev/null +++ b/pyth/templates/one.html @@ -0,0 +1,12 @@ + + + + + + Article + + +

Test Title

+

Test Text

+ + \ No newline at end of file diff --git a/pyth/templates/two.html b/pyth/templates/two.html new file mode 100644 index 0000000..bcba718 --- /dev/null +++ b/pyth/templates/two.html @@ -0,0 +1,12 @@ + + + + + + Article + + +

Test Title

+

Test Text

+ + \ No newline at end of file diff --git a/pyth/tests/__pycache__/test_scrapingsingle.cpython-310.pyc b/pyth/tests/__pycache__/test_scrapingsingle.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab3b6cea01b3884563720a5ef05001b86732ad46 GIT binary patch literal 2416 zcmZ`4%Z?jGaC&Ty$K%I7vv~l?aDZS)>`e$DMNvdZR?&jAiP#8aMPL7wvZ7!1vpG-}9Tk zq{*PVnZT z6Rq(&z?1wGz|$6(L0Bg=qw}KPPhFRSmJDTiK&%TCqqEm zUvN6s>F!u;7vmBLi+4(%-`L#9b~s~cK2SjO@814~C%TY;dow;@O;J!5vqJNHuUj!o z1FmB#b(#nsi!^_rOf$_DmpW!tQ?rn&SfyDisGJcxckXU%-Pnw8+}(`7+W2;|Hq^Sj z+V5Yw{8?`u{!3RsSzll8C#dtQp9_o6VW?FA+=&QtB9{^7K6W4$6|l2v7VdGSH*?Jg zlIpa`69RjWzYZW6`kOFR037XTqTNIHfJ~jEimn1q7u9(6_w ztib{d+0QO(v(B{klfxFKb%2Y%)Q~!G+0qmDz@Pe~MYep}87+;L4;mU$VhSm75KIHM za@sj)!n_J|OP@Gevz#_qbdsHzHqjc3uT5KX94)OKTl#Q;IpDgJTYsT{C(WVGM6aY0 z0tmr%hnHbog7NWl629h5++KsR4dd@q00wZ+eII;DMz%zrkRRNK?zZ#Hy$4xSQ(C6w zGe|1T7tO!ZUDvq1byL%+7W|nbPlJr-oBd&t@jg{_hvsSj7FQ2+QTEjy-%Cdj+w@{a zmFBWv?(1P+14UUFt3=Wg^A?8Sy>j0K*F>sxQe@@z_fSVv_>?ob@wp{!Fr@rn=*;0$hf3&016ybWN|9+djx$GuN3 z>apA@jHmX&v~2RxEvTE2=2YxMRb2?bEXEQRpbHRLxWyl-Q1Y^n;0JJGRThGqdQyN1 zxi-}?gl=+u11_hDhK+*n(OidVzFWx5RzKVU!h!moK&>~Xk!Kg^5#VvY)c*bJ^i`BZZj>cR?4#!?~~jkns`=w7gBDT}?7 z@!3AXX%5G*MTAs^H2iY=&Q@6LjG*C!k5WAh&)cKt?a}jd>&}O}DHlxj!0uXBFfNRr zai!>hD~Yu(F%#vx=-mJVJu_(1nBkyQ_mqUD?--BGdes6Il2f)HCqte*;LJ3s!sfN{ za7=}%2%ej!E;JSM-sVgGaExhZDv-#sg4dlzf&OlqvzQ8Ds-q&!jhBej#v>V0ZJL;e zf@4ooz=A6^#%x%m30I~*x4|k7=T(uL8n#(w0{(ADHcgAiaKGF){!Gu!yxTEV#l^0v zqD2N2R3+TPfeNkNc-D;|$j-L%pyt36$~17AQG%EHna+Uc9Qr3`GhX{xqV3-^4_lMH zJ^MPmk>qCa&5FWgSL*-*vPgV4AT82yS4j(o@47J7i2K5O(WwWnPka)%Ew@FMN%hYO zB;jlQ{6EFI1x??N}M{kG}WE6ukD7zt9}o_c-1cpq0VLkDTF~_ddSG#n@P2!S9dTzwtk-mh~4d zj{h83Jb)orK`@Jv)asTq=~A<{ySAB~t^+eo-OTHHS*2Ua{H~u>yVWe{2E_WoVm5PL zSjd>DB=ItZHzb1+4bU>W;BG8-uyQ8f+ZqaW=sw zVV+=9>=MkAY)#n(ink|NiY`R~F<3roueJXu&Q!PBMoqOQTCCl8mF zo-HiSo5!7pkJ~W;Hx7P?D=_9@$PEz68WHqsXob{Z@C>kRF!%=8F&Ml9>>3RI0rpg7 z!yj536ow7Mf#JgNnD;x(VO0gNs_qcDMGJhXbc7FI6cACCzyK396m<}Lu6EW7skGB8 z1js&Inf7~;lrIXww8vKRDC4b~riIiVTq4gmMFUTdqnJQ3iDC)_9F$ve(OdgeSK=bi zc?=p62oIj_5pBn*+_S`0Sp7XWyIy2`Hj>dwlqa+ETs~JtZ&q&d&13_ZPwr%qR9wvV zw$%Epf*omOxCv_KA;4yOTRMONE(;X{op4U%beTdGErOM4b&f zxN+Ky6K|fekq=&3oF5M3k>_ zfxSVvAao7V0V;3%mwuGCJb_oE-BP8aP@98_bd_f-oUtTd(+*bb(=^>MpWmMTlaSvbf1eg6(oo2IHvU##e0xyUC zGYG9)94ov37NgiR@ot-+~VJLGDZEL>riz61-4nB AbN~PV literal 0 HcmV?d00001 diff --git a/pyth/tests/test_scrapingsingle.py b/pyth/tests/test_scrapingsingle.py new file mode 100644 index 0000000..5afcfda --- /dev/null +++ b/pyth/tests/test_scrapingsingle.py @@ -0,0 +1,60 @@ +import unittest +from unittest.mock import patch +import requests +from bs4 import BeautifulSoup +from langchain.embeddings import OpenAIEmbeddings +from langchain.vectorstores.pgvector import PGVector +from openai import OpenAI +import json +from dotenv import load_dotenv +from scrapingsingle import get_article_links, insert_data, is_similar_data +import os + +load_dotenv() + +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +client = OpenAI() +embeddings = OpenAIEmbeddings() + + +already_checked = set() +total_links = set() +collected_news = set() +dlinks = 'http://127.0.0.1:5000/' + +class TestIntegration(unittest.TestCase): + + + def test_integration(self): + link = get_article_links(dlinks,already_checked) + self.assertEqual(len(already_checked), 2) + + for link in total_links: + response = requests.get(link) + soup = BeautifulSoup(response.text, 'html.parser') + + titles = soup.find_all(['h2', 'h1', 'h3']) + title_text = ' '.join([title.get_text(strip=True) for title in titles]) + + texts = soup.find_all(['p']) + text_text = ' '.join([text.get_text(strip=True) for text in texts]) + + completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "Data analytic, Journalist and News reporter"}, + {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."} + ] + ) + generated_text = completion.choices[0].message.content + + response_data = json.loads(generated_text) + title = response_data["title"] + text = response_data["content"] + + vector = embeddings.embed_query(generated_text) + + self.assertIn("Test Title", title) + self.assertIn("Test Text", text) + self.assertEqual(len(total_links), 2) + diff --git a/pyth/tests/test_vectData.py b/pyth/tests/test_vectData.py new file mode 100644 index 0000000..99d4dd6 --- /dev/null +++ b/pyth/tests/test_vectData.py @@ -0,0 +1,89 @@ +import unittest +import numpy as np +import psycopg2 +import os +from vectData import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db + +class TestIntegration(unittest.TestCase): + host = os.getenv("DB_HOST") + port = os.getenv("DB_PORT") + user = os.getenv("DB_USER") + password = os.getenv("DB_PASSWORD") + dbname = os.getenv("DB_NAME") + + @classmethod + def setUpClass(cls): + cls.host = os.getenv("DB_HOST") + cls.port = os.getenv("DB_PORT") + cls.user = os.getenv("DB_USER") + cls.password = os.getenv("DB_PASSWORD") + cls.dbname = os.getenv("DB_NAME") + + cls.conn = psycopg2.connect( + host=cls.host, + port=cls.port, + user=cls.user, + password=cls.password, + dbname=cls.dbname + ) + create_db(cls.conn) + + @classmethod + def tearDownClass(cls): + cls.conn.close() + + def setUp(self): + if self.conn.closed: + self.conn = psycopg2.connect( + host=self.host, + port=self.port, + user=self.user, + password=self.password, + dbname=self.dbname + ) + self.cursor = self.conn.cursor() + + def tearDown(self): + if not self.cursor.closed: + self.cursor.close() + + if not self.conn.closed: + self.conn.close() + + def test_insert_and_retrieve_data(self): + title = 'test_title' + text = 'test_text' + link = 'test_link' + embedding = np.arange(1, 1537) + + insert_data(title, text, link, embedding) + + data = get_data() + + self.assertEqual(data, [(title, text, link)]) + + def test_is_similar_data_integration(self): + title = 'test_title' + text = 'test_text' + link = 'test_link' + embedding = np.arange(1, 1537) + + insert_data(title, text, link, embedding) + + result = is_similar_data(title, text, link, embedding) + self.assertTrue(result) + + result = is_similar_data(title, text, link, embedding) + self.assertTrue(result) + + result = is_similar_data(title, text, link, embedding) + self.assertTrue(result) + + def test_create_db_integration(self): + cursor = self.conn.cursor() + cursor.execute("SELECT * FROM information_schema.tables WHERE table_name = 'vectorsvevijesti'") + table_exist = bool(cursor.fetchone()) + self.assertTrue(table_exist) + +if __name__ == '__main__': + unittest.main() diff --git a/pyth/vectData.py b/pyth/vectData.py index dd1e2d7..e99883a 100644 --- a/pyth/vectData.py +++ b/pyth/vectData.py @@ -3,12 +3,26 @@ from psycopg2 import sql from pgvector.psycopg2 import register_vector from sklearn.metrics.pairwise import cosine_similarity import numpy as np +import os +from dotenv import load_dotenv +from datetime import datetime ,timedelta -host = 'localhost' -port = '5432' -user = 'postgres' -password = 'salmonela pljusti 221 hamo' -dbname = 'vector_svw' + +load_dotenv() + +host = os.getenv("DB_HOST") +port = os.getenv("DB_PORT") +user = os.getenv("DB_USER") +password = os.getenv("DB_PASSWORD") +dbname = os.getenv("DB_NAME") + +conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) def calculate_cosine_similarity(v1, v2): v1_normalized = v1 / np.linalg.norm(v1) @@ -17,7 +31,7 @@ def calculate_cosine_similarity(v1, v2): similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0] return similarity -def is_similar_data(title, text, link, embedding, threshold=0.9): +def is_similar_data(title, text, link, embedding, threshold=0.98): conn = psycopg2.connect( host=host, port=port, @@ -27,25 +41,33 @@ def is_similar_data(title, text, link, embedding, threshold=0.9): ) cursor = conn.cursor() - cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;') + cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;') existing_embeddings = cursor.fetchall() for existing_embedding_tuple in existing_embeddings: existing_title = existing_embedding_tuple[0] existing_embedding = np.array(existing_embedding_tuple[1]).flatten() + existing_link = existing_embedding_tuple[2] similarity = calculate_cosine_similarity(existing_embedding, embedding) if similarity > threshold: - print(f"Similar data found: \n #{title} \n #{existing_title}") - cursor.close() - conn.close() - return True + if link != existing_link: + similar_d = existing_title + insert_data(title,text,link,embedding,similar_d) + print(f"Similar data found: \n #{title} \n #{existing_title}") + print(f"Inserting: #{title} \n") + similar_d = "NO" + cursor.close() + return True + else: + print(f"Same source of same article!") + cursor.close() + return True print(f"Inserting: #{title}") cursor.close() - conn.close() return False -def insert_data(title, text, link, embedding): +def get_similar(): conn = psycopg2.connect( host=host, port=port, @@ -53,17 +75,35 @@ def insert_data(title, text, link, embedding): password=password, dbname=dbname ) + cursor = conn.cursor() + query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')''' + cursor.execute(query) + similar_data = cursor.fetchall() + cursor.close() + return similar_data + + +def insert_data(title, text, link, embedding, similar_d): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + c_time = datetime.now() + + cursor = conn.cursor() cursor.execute(''' - INSERT INTO vectorsvevijesti (title, text, link, embedding) - VALUES (%s, %s, %s, %s); - ''', (title, text, link, embedding)) + INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time) + VALUES (%s, %s, %s, %s, %s ,%s); + ''', (title, text, link, embedding , similar_d, c_time)) conn.commit() cursor.close() - conn.close() def get_data(): conn = psycopg2.connect( @@ -79,11 +119,110 @@ def get_data(): cursor.execute(query) data = cursor.fetchall() cursor.close() - conn.close() - return data -def create_db(): +def modify_similar_data(new_value ,title): + + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + + query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s ''' + + cursor.execute(query, (new_value, title)) + + conn.commit() + +def get_specific_data(title): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + query = '''SELECT title, text, link, similar_d, embedding FROM vectorsvevijesti WHERE title = %s''' + cursor.execute(query, (title,)) + + specific_post = cursor.fetchall() + cursor.close() + return specific_post + +def get_all_links(): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + query = '''SELECT link FROM vectorsvevijesti''' + cursor.execute(query) + + db_links = {link[0] for link in cursor.fetchall()} + cursor.close() + return db_links + +def delete_specific(title): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + + cursor = conn.cursor() + query = '''DELETE FROM vectorsvevijesti WHERE title = %s''' + + cursor.execute(query,(title,)) + cursor.close() + +def cleansing(): + + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + + day_long = datetime.now() - timedelta(days=1) + + cursor = conn.cursor() + + query = '''DELETE FROM vectorsvevijesti WHERE time < %s''' + cursor.execute(query,(day_long,)) + + conn.commit() + cursor.close() + +def drop_table(): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + + cursor = conn.cursor() + + query = '''DROP TABLE IF EXISTS vectorsvevijesti;''' + cursor.execute(query) + + conn.commit() + cursor.close() + +def create_db(conn): conn = psycopg2.connect( host=host, port=port, @@ -97,19 +236,18 @@ def create_db(): register_vector(conn) - cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;") - cursor.execute(''' - CREATE TABLE vectorsvevijesti ( + CREATE TABLE IF NOT EXISTS vectorsvevijesti ( id bigserial PRIMARY KEY, title VARCHAR, text VARCHAR, link VARCHAR, - embedding vector(1536) + embedding vector(1536), + similar_d VARCHAR, + time TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); ''') conn.commit() cursor.close() - conn.close() -create_db() \ No newline at end of file +create_db(conn) diff --git a/pyth/web-server.py b/pyth/web-server.py new file mode 100644 index 0000000..ae78c2b --- /dev/null +++ b/pyth/web-server.py @@ -0,0 +1,24 @@ +from flask import Flask , render_template , jsonify +from vectData import get_data +from flask_cors import CORS + + +app = Flask(__name__) + +CORS(app) + +@app.route('/') +def index() : + return render_template("index.html") + + +@app.route('/article/one') +def articleone(): + return render_template("one.html") + + +@app.route('/article/two') +def articletwo(): + return render_template("two.html") + +app.run(debug=True) \ No newline at end of file