Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/no hrfp counter #285 #288

Merged
merged 14 commits into from
Dec 2, 2024
50 changes: 50 additions & 0 deletions alembic/versions/ac96222af6fe_hrfp_counters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""hrfp counters

Revision ID: ac96222af6fe
Revises: 30abfd828007
Create Date: 2024-12-02 14:36:21.970968

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = 'ac96222af6fe'
down_revision: Union[str, None] = '30abfd828007'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add the nullable integer ``*_no_hrfp`` counter columns to ``keywords``.

    The columns are added in the same order Alembic generated them.
    """
    no_hrfp_columns = (
        'number_of_changement_climatique_constat_no_hrfp',
        'number_of_changement_climatique_causes_no_hrfp',
        'number_of_changement_climatique_consequences_no_hrfp',
        'number_of_attenuation_climatique_solutions_no_hrfp',
        'number_of_adaptation_climatique_solutions_no_hrfp',
        'number_of_ressources_no_hrfp',
        'number_of_ressources_solutions_no_hrfp',
        'number_of_biodiversite_concepts_generaux_no_hrfp',
        'number_of_biodiversite_causes_no_hrfp',
        'number_of_biodiversite_consequences_no_hrfp',
        'number_of_biodiversite_solutions_no_hrfp',
    )
    for column_name in no_hrfp_columns:
        # Each call builds its own Column object; one is never shared across calls.
        op.add_column('keywords', sa.Column(column_name, sa.Integer(), nullable=True))


def downgrade() -> None:
    """Drop the ``*_no_hrfp`` counter columns from ``keywords``.

    Columns are dropped in the reverse of the order ``upgrade`` adds them.
    """
    no_hrfp_columns_in_drop_order = (
        'number_of_biodiversite_solutions_no_hrfp',
        'number_of_biodiversite_consequences_no_hrfp',
        'number_of_biodiversite_causes_no_hrfp',
        'number_of_biodiversite_concepts_generaux_no_hrfp',
        'number_of_ressources_solutions_no_hrfp',
        'number_of_ressources_no_hrfp',
        'number_of_adaptation_climatique_solutions_no_hrfp',
        'number_of_attenuation_climatique_solutions_no_hrfp',
        'number_of_changement_climatique_consequences_no_hrfp',
        'number_of_changement_climatique_causes_no_hrfp',
        'number_of_changement_climatique_constat_no_hrfp',
    )
    for column_name in no_hrfp_columns_in_drop_order:
        op.drop_column('keywords', column_name)
11 changes: 11 additions & 0 deletions postgres/schemas/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,17 @@ class Keywords(Base):
number_of_keywords_climat = Column(Integer) # sum of all climatique counters without duplicate (like number_of_keywords)
number_of_keywords_biodiversite = Column(Integer) # sum of all biodiversite counters without duplicate
number_of_keywords_ressources = Column(Integer) # sum of all ressources counters without duplicate
# "_no_hrfp" counters: per-theme counts computed with count_high_risk_false_positive=False
number_of_changement_climatique_constat_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_changement_climatique_constat_no_hrfp integer;
number_of_changement_climatique_causes_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_changement_climatique_causes_no_hrfp integer;
number_of_changement_climatique_consequences_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_changement_climatique_consequences_no_hrfp integer;
number_of_attenuation_climatique_solutions_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_attenuation_climatique_solutions_no_hrfp integer;
number_of_adaptation_climatique_solutions_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_adaptation_climatique_solutions_no_hrfp integer;
number_of_ressources_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_ressources_no_hrfp integer;
number_of_ressources_solutions_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_ressources_solutions_no_hrfp integer;
number_of_biodiversite_concepts_generaux_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_concepts_generaux_no_hrfp integer;
number_of_biodiversite_causes_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_causes_no_hrfp integer;
number_of_biodiversite_consequences_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_consequences_no_hrfp integer;
number_of_biodiversite_solutions_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_solutions_no_hrfp integer;

class Channel_Metadata(Base):
__tablename__ = channel_metadata_table
Expand Down
75 changes: 64 additions & 11 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def remove_stopwords(plaintext: str) -> str:
@sentry_sdk.trace
def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], start: datetime):
keywords_with_timestamp = []
number_of_elements_in_array = 17
number_of_elements_in_array = 28
default_window_in_seconds = DEFAULT_WINDOW_DURATION
plaitext_without_stopwords = remove_stopwords(plaintext)
logging.debug(f"display datetime start {start}")
Expand Down Expand Up @@ -192,8 +192,32 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
number_of_biodiversite_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_causes"])
number_of_biodiversite_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_consequences"])
number_of_biodiversite_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_solutions"])

return [

# No high risk of false positive counters
number_of_changement_climatique_constat_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_constat"], \
count_high_risk_false_positive=False)
number_of_changement_climatique_causes_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_causes"], \
count_high_risk_false_positive=False)
number_of_changement_climatique_consequences_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_consequences"], \
count_high_risk_false_positive=False)
number_of_attenuation_climatique_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["attenuation_climatique_solutions"], \
count_high_risk_false_positive=False)
number_of_adaptation_climatique_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["adaptation_climatique_solutions"], \
count_high_risk_false_positive=False)
number_of_ressources_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["ressources"], \
count_high_risk_false_positive=False)
number_of_ressources_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["ressources_solutions"], \
count_high_risk_false_positive=False)
number_of_biodiversite_concepts_generaux_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_concepts_generaux"], \
count_high_risk_false_positive=False)
number_of_biodiversite_causes_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_causes"], \
count_high_risk_false_positive=False)
number_of_biodiversite_consequences_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_consequences"], \
count_high_risk_false_positive=False)
number_of_biodiversite_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_solutions"], \
count_high_risk_false_positive=False)

return [ # Change number_of_elements_in_array if a new element is added here
theme
,keywords_with_timestamp
,number_of_keywords
Expand All @@ -211,8 +235,20 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
,number_of_keywords_climat
,number_of_keywords_biodiversite
,number_of_keywords_ressources
,number_of_changement_climatique_constat_no_hrfp
,number_of_changement_climatique_causes_no_hrfp
,number_of_changement_climatique_consequences_no_hrfp
,number_of_attenuation_climatique_solutions_no_hrfp
,number_of_adaptation_climatique_solutions_no_hrfp
,number_of_ressources_no_hrfp
,number_of_ressources_solutions_no_hrfp
,number_of_biodiversite_concepts_generaux_no_hrfp
,number_of_biodiversite_causes_no_hrfp
,number_of_biodiversite_consequences_no_hrfp
,number_of_biodiversite_solutions_no_hrfp
]
else:
logging.info("Empty keywords")
return [None] * number_of_elements_in_array

def get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds: int = 20):
Expand Down Expand Up @@ -274,6 +310,17 @@ def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :
,"number_of_keywords_climat"
,"number_of_keywords_biodiversite"
,"number_of_keywords_ressources"
,"number_of_changement_climatique_constat_no_hrfp"
,"number_of_changement_climatique_causes_no_hrfp"
,"number_of_changement_climatique_consequences_no_hrfp"
,"number_of_attenuation_climatique_solutions_no_hrfp"
,"number_of_adaptation_climatique_solutions_no_hrfp"
,"number_of_ressources_no_hrfp"
,"number_of_ressources_solutions_no_hrfp"
,"number_of_biodiversite_concepts_generaux_no_hrfp"
,"number_of_biodiversite_causes_no_hrfp"
,"number_of_biodiversite_consequences_no_hrfp"
,"number_of_biodiversite_solutions_no_hrfp"
]
] = df[['plaintext','srt', 'start']]\
.swifter.apply(\
Expand All @@ -282,9 +329,10 @@ def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :
result_type='expand'
)

logging.info("Dropping")
# remove all rows that do not have a theme
df = df.dropna(subset=['theme'], how='any') # any is for None values

logging.info("Droped")
logging.info(f"After filtering with out keywords, we have {len(df)} out of {count_before_filtering} subtitles left that are insteresting for us")

return df
Expand All @@ -302,15 +350,21 @@ def add_primary_key(row):
def filter_indirect_words(keywords_with_timestamp: List[dict]) -> List[dict]:
    """Return the entries whose ``'theme'`` value does not contain ``indirectes``.

    The input list is left untouched; a new list is returned.
    """
    return [
        keyword_entry
        for keyword_entry in keywords_with_timestamp
        if indirectes not in keyword_entry['theme']
    ]

def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: List[str] = None) -> int:
def filter_high_risk_false_positive(keywords_with_timestamp: List[dict]) -> List[dict]:
    """Return the entries that do not carry an ``'hrfp'`` key.

    Only the presence of the key is tested, not its value. The input list is
    left untouched; a new list is returned.
    """
    return [
        keyword_entry
        for keyword_entry in keywords_with_timestamp
        if 'hrfp' not in keyword_entry
    ]

def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: List[str] = None, count_high_risk_false_positive: bool = True) -> int:
total_keywords = len(keywords_with_timestamp)
if(total_keywords) == 0:
return 0
else:
logging.debug(f"keywords_with_timestamp is {keywords_with_timestamp}")
if theme is not None:
logging.debug(f"filter theme {theme}")
keywords_with_timestamp = list(filter(lambda kw: kw['theme'] in theme, keywords_with_timestamp))

if count_high_risk_false_positive is False:
keywords_with_timestamp = filter_high_risk_false_positive(keywords_with_timestamp)
logging.debug(f"keywords_with_timestamp is after filtering {keywords_with_timestamp}")
length_filtered_items = len(keywords_with_timestamp)

if length_filtered_items > 0:
Expand Down Expand Up @@ -357,7 +411,9 @@ def transform_false_positive_keywords_to_positive(keywords_with_timestamp: List[

if( contains_direct_keywords_same_suject(neighbour_keywords, keyword_info['theme']) ) :
logging.debug(f"Transforming false positive to positive { keyword_info['keyword']} { keyword_info['theme']}")
keyword_info['theme'] = remove_indirect(keyword_info['theme'])
if indirectes in keyword_info['theme']:
keyword_info['theme'] = remove_indirect(keyword_info['theme'])
keyword_info['hrfp'] = True # to store if a keyword was a transformed to a direct keyword

return keywords_with_timestamp

Expand All @@ -383,7 +439,4 @@ def tag_wanted_duration_second_window_number(keywords_with_timestamp: List[dict]
return keywords_with_timestamp

def remove_indirect(theme: str) -> str:
    """Return ``theme`` with every ``_<indirectes>`` occurrence removed.

    ``str.replace`` returns the string unchanged when the pattern is absent,
    so no prior membership check is needed.
    """
    return theme.replace(f'_{indirectes}', '')
54 changes: 49 additions & 5 deletions quotaclimat/data_processing/mediatree/update_pg_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,18 @@ def update_keywords(session: Session, batch_size: int = 50000, start_date : str
,number_of_biodiversite_solutions_directes \
,new_number_of_keywords_climat \
,new_number_of_keywords_biodiversite \
,new_number_of_keywords_ressources = get_themes_keywords_duration(plaintext, srt, start)
,new_number_of_keywords_ressources \
,number_of_changement_climatique_constat_no_hrfp \
,number_of_changement_climatique_causes_no_hrfp \
,number_of_changement_climatique_consequences_no_hrfp \
,number_of_attenuation_climatique_solutions_no_hrfp \
,number_of_adaptation_climatique_solutions_no_hrfp \
,number_of_ressources_no_hrfp \
,number_of_ressources_solutions_no_hrfp \
,number_of_biodiversite_concepts_generaux_no_hrfp \
,number_of_biodiversite_causes_no_hrfp \
,number_of_biodiversite_consequences_no_hrfp \
,number_of_biodiversite_solutions_no_hrfp = get_themes_keywords_duration(plaintext, srt, start)
except Exception as err:
logging.error(f"continuing loop but met error : {err}")
continue
Expand Down Expand Up @@ -83,10 +94,21 @@ def update_keywords(session: Session, batch_size: int = 50000, start_date : str
,number_of_biodiversite_causes_directes
,number_of_biodiversite_consequences
,number_of_biodiversite_solutions_directes
,channel_title=channel_title
,number_of_keywords_climat=new_number_of_keywords_climat
,number_of_keywords_biodiversite=new_number_of_keywords_biodiversite
,number_of_keywords_ressources=new_number_of_keywords_ressources
,channel_title
,new_number_of_keywords_climat
,new_number_of_keywords_biodiversite
,new_number_of_keywords_ressources
,number_of_changement_climatique_constat_no_hrfp
,number_of_changement_climatique_causes_no_hrfp
,number_of_changement_climatique_consequences_no_hrfp
,number_of_attenuation_climatique_solutions_no_hrfp
,number_of_adaptation_climatique_solutions_no_hrfp
,number_of_ressources_no_hrfp
,number_of_ressources_solutions_no_hrfp
,number_of_biodiversite_concepts_generaux_no_hrfp
,number_of_biodiversite_causes_no_hrfp
,number_of_biodiversite_consequences_no_hrfp
,number_of_biodiversite_solutions_no_hrfp
)
else: # Program only mode
logging.info(f"Updating program for keyword {keyword_id} - {channel_name} - original tz : {start}")
Expand Down Expand Up @@ -176,6 +198,17 @@ def update_keyword_row(session: Session,
,number_of_keywords_climat: int
,number_of_keywords_biodiversite: int
,number_of_keywords_ressources: int
,number_of_changement_climatique_constat_no_hrfp: int,
number_of_changement_climatique_causes_no_hrfp: int,
number_of_changement_climatique_consequences_no_hrfp: int,
number_of_attenuation_climatique_solutions_no_hrfp: int,
number_of_adaptation_climatique_solutions_no_hrfp: int,
number_of_ressources_no_hrfp: int,
number_of_ressources_solutions_no_hrfp: int,
number_of_biodiversite_concepts_generaux_no_hrfp: int,
number_of_biodiversite_causes_no_hrfp: int,
number_of_biodiversite_consequences_no_hrfp: int,
number_of_biodiversite_solutions_no_hrfp: int
):
if matching_themes is not None:
session.query(Keywords).filter(Keywords.id == keyword_id).update(
Expand All @@ -198,6 +231,17 @@ def update_keyword_row(session: Session,
,Keywords.number_of_keywords_climat: number_of_keywords_climat
,Keywords.number_of_keywords_biodiversite: number_of_keywords_biodiversite
,Keywords.number_of_keywords_ressources: number_of_keywords_ressources
,Keywords.number_of_changement_climatique_constat_no_hrfp:number_of_changement_climatique_constat_no_hrfp ,
Keywords.number_of_changement_climatique_causes_no_hrfp:number_of_changement_climatique_causes_no_hrfp ,
Keywords.number_of_changement_climatique_consequences_no_hrfp:number_of_changement_climatique_consequences_no_hrfp ,
Keywords.number_of_attenuation_climatique_solutions_no_hrfp:number_of_attenuation_climatique_solutions_no_hrfp ,
Keywords.number_of_adaptation_climatique_solutions_no_hrfp:number_of_adaptation_climatique_solutions_no_hrfp ,
Keywords.number_of_ressources_no_hrfp:number_of_ressources_no_hrfp,
Keywords.number_of_ressources_solutions_no_hrfp:number_of_ressources_solutions_no_hrfp ,
Keywords.number_of_biodiversite_concepts_generaux_no_hrfp:number_of_biodiversite_concepts_generaux_no_hrfp ,
Keywords.number_of_biodiversite_causes_no_hrfp:number_of_biodiversite_causes_no_hrfp ,
Keywords.number_of_biodiversite_consequences_no_hrfp:number_of_biodiversite_consequences_no_hrfp ,
Keywords.number_of_biodiversite_solutions_no_hrfp:number_of_biodiversite_solutions_no_hrfp,
},
synchronize_session=False
)
Expand Down
Empty file added secrets/.empty
Empty file.
Loading