diff --git a/howdoi/howdoi.py b/howdoi/howdoi.py
index fc31d0586..610091abc 100755
--- a/howdoi/howdoi.py
+++ b/howdoi/howdoi.py
@@ -44,6 +44,10 @@ from howdoi import __version__
 from howdoi.errors import GoogleValidationError, BingValidationError, DDGValidationError
+from .stats import CollectStats
+
+DEFAULT_DIR = appdirs.user_cache_dir('howdoi-local-stats')
+
 logging.basicConfig(format='%(levelname)s: %(message)s')
 if os.getenv('HOWDOI_DISABLE_SSL'):  # Set http instead of https
     SCHEME = 'http://'
@@ -88,7 +92,7 @@ CACHE_ENTRY_MAX = 128
 HTML_CACHE_PATH = 'page_cache'
-SUPPORTED_HELP_QUERIES = ['use howdoi', 'howdoi', 'run howdoi', 'setup howdoi',
+SUPPORTED_HELP_QUERIES = ['use howdoi', 'howdoi', 'run howdoi', 'do howdoi',
                           'howdoi howdoi', 'howdoi use howdoi']
 # variables for text formatting, prepend to string to begin text formatting.
@@ -104,14 +108,15 @@ STASH_REMOVE = 'remove'
 STASH_EMPTY = 'empty'
-BLOCKED_ENGINES = []
-
 if os.getenv('HOWDOI_DISABLE_CACHE'):
     # works like an always empty cache
     cache = NullCache()
 else:
     cache = FileSystemCache(CACHE_DIR, CACHE_ENTRY_MAX, default_timeout=0)
+ENABLE_USER_STATS = True
+# Creating the object also initialises the stats cache via the constructor.
+CollectStats_obj = CollectStats(cache)
 howdoi_session = requests.session()
@@ -187,6 +192,16 @@ def _get_result(url):
                       'HTTPS by setting the environment variable "HOWDOI_DISABLE_SSL".\n%s', RED, END_FORMAT)
         raise error
+def _get_from_cache(cache_key):
+    # As of cachelib 0.3.0, cachelib internally logs a warning on a cache miss
+    current_log_level = logging.getLogger().getEffectiveLevel()
+    # Reduce the log level so the warning is not printed
+    logging.getLogger().setLevel(logging.ERROR)
+    page = cache.get(cache_key)  # pylint: disable=assignment-from-none
+    # Restore the log level
+    logging.getLogger().setLevel(current_log_level)
+    return page
+
 def _add_links_to_text(element):
     hyperlinks = element.find('a')
@@ -280,7 +295,7 @@ def _get_links(query):
         result = None
     if not result or _is_blocked(result):
         logging.error('%sUnable to find an answer because the search engine temporarily blocked the request. '
-                      'Attempting to use a different search engine.%s', RED, END_FORMAT)
+                      'Please wait a few minutes or select a different search engine.%s', RED, END_FORMAT)
         raise BlockError('Temporary block by search engine')
     html = pq(result)
@@ -402,7 +417,7 @@ def _get_links_with_cache(query):
     question_links = _get_questions(links)
     cache.set(cache_key, question_links or CACHE_EMPTY_VAL)
-
+    CollectStats_obj.process_links(question_links)
     return question_links
@@ -591,8 +606,8 @@ def howdoi(raw_query):
     else:
         args = raw_query
-    search_engine = args['search_engine'] or os.getenv('HOWDOI_SEARCH_ENGINE') or 'google'
-    os.environ['HOWDOI_SEARCH_ENGINE'] = search_engine
+    os.environ['HOWDOI_SEARCH_ENGINE'] = args['search_engine'] or os.getenv('HOWDOI_SEARCH_ENGINE') or 'google'
+    search_engine = os.getenv('HOWDOI_SEARCH_ENGINE')
     if search_engine not in SUPPORTED_SEARCH_ENGINES:
         supported_search_engines = ', '.join(SUPPORTED_SEARCH_ENGINES)
        message = f'Unsupported engine {search_engine}. The supported engines are: {supported_search_engines}'
@@ -605,9 +620,13 @@ def howdoi(raw_query):
     if _is_help_query(args['query']):
         return _get_help_instructions() + '\n'
-    res = cache.get(cache_key)  # pylint: disable=assignment-from-none
+    if ENABLE_USER_STATS:
+        CollectStats_obj.run(args)
+    res = _get_from_cache(cache_key)  # pylint: disable=assignment-from-none
     if res:
+        CollectStats_obj.increase_cache_hits()
+        CollectStats_obj.process_response(res)
         logging.info('Using cached response (add -C to clear the cache)')
         return _parse_cmd(args, res)
@@ -622,17 +641,9 @@ def howdoi(raw_query):
             res = {'error': message}
             cache.set(cache_key, res)
     except (RequestsConnectionError, SSLError):
-        res = {'error': f'Unable to reach {search_engine}. Do you need to use a proxy?\n'}
-    except BlockError:
-        BLOCKED_ENGINES.append(search_engine)
-        next_engine = next((engine for engine in SUPPORTED_SEARCH_ENGINES if engine not in BLOCKED_ENGINES), None)
-        if next_engine is None:
-            res = {'error': 'Unable to get a response from any search engine\n'}
-        else:
-            args['search_engine'] = next_engine
-            args['query'] = args['query'].split()
-            logging.info('%sRetrying search with %s%s', GREEN, next_engine, END_FORMAT)
-            return howdoi(args)
+        res = {'error': f'Unable to reach {args["search_engine"]}. Do you need to use a proxy?\n'}
+
+    CollectStats_obj.process_response(res)
     return _parse_cmd(args, res)
@@ -676,6 +687,8 @@ def get_parser():
                         action='store_true')
     parser.add_argument('--sanity-check', help=argparse.SUPPRESS,
                         action='store_true')
+    parser.add_argument('--stats', help='view your local statistics for howdoi', action='store_true')
+    parser.add_argument('--disable_stats', help='disable local stats collection for howdoi', action='store_true')
     return parser
@@ -765,6 +778,9 @@ def command_line_runner():  # pylint: disable=too-many-return-statements,too-many-branches
         perform_sanity_check()
     )
+    if args['stats']:
+        CollectStats_obj.render_stats()
+
     if args['clear_cache']:
         if _clear_cache():
             print(f'{GREEN}Cache cleared successfully{END_FORMAT}')
diff --git a/howdoi/stats.py b/howdoi/stats.py
new file mode 100644
index 000000000..ff5868763
--- /dev/null
+++ b/howdoi/stats.py
@@ -0,0 +1,356 @@
+import collections
+import sys
+from datetime import datetime
+
+from termgraph import termgraph
+
+from .utils import get_top_n_key_val_pairs_from_dict, safe_divide
+
+# Module-level cache keys, shared by every CollectStats instance.
+# Stores the date howdoi was first installed.
+FIRST_INSTALLED = 'FIRST_INSTALLED'
+# Permission for the dashboard, initially set to True;
+# data is sent to the dashboard only when this is True.
+DASHBOARD_PERMISSION = True
+# Stop words excluded from the per-word query statistics.
+REDUNDANT_WORDS = ['a', 'an', 'the', 'is', 'for', 'on', 'it', 'in']
+HOUR_OF_DAY_KEY = 'HOUR_OF_DAY'
+QUERY_WORD_KEY = 'QUERY_WORDS'
+# The user can choose any of these; index 0 is the default.
+DATESTRING_FORMATS = ["%Y-%m-%d", "%d-%m-%Y", "%m-%d-%Y"]
+# Stores the total number of queries made with howdoi.
+TOTAL_REQUESTS = 'TOTAL_REQUESTS'
+SEARCH_ENGINES = 'SEARCH_ENGINES'
+# Tracks processed answer links and their frequency.
+PROCESSED_LINKS = 'PROCESSED_LINKS'
+CACHE_HITS = 'CACHE_HITS'
+# Key counting responses that contained an error.
+ERROR_RESULT_KEY = 'ERROR_RESULT'
+# Key counting responses that did not contain an error.
+SUCCESS_RESULT_KEY = 'SUCCESS_RESULT'
+QUERY_KEY = 'QUERY_KEY'
+
+# ---------------------------- termgraph helpers ----------------------------
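+# The helpers below call termgraph's chart() function directly with a dict of
+# CLI-style arguments. Note that termgraph.chart(colors, data, args, labels)
+# is an internal interface of the termgraph package rather than a documented
+# public API, so the argument layout here is an assumption tied to the
+# termgraph version pinned when this module was written.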
+TERMGRAPH_DEFAULT_ARGS = {'filename': '-', 'title': None, 'width': 50,
+                          'format': '{:<5.1f}', 'suffix': '', 'no_labels': False,
+                          'no_values': False, 'color': None, 'vertical': False,
+                          'stacked': False, 'histogram': False, 'bins': 5,
+                          'different_scale': False, 'calendar': False,
+                          'start_dt': None, 'custom_tick': '', 'delim': '',
+                          'verbose': False, 'label_before': False, 'version': False}
+
+Report = collections.namedtuple('Report', ['group', 'content'])
+
+
+def draw_graph(data, labels, custom_args=None):
+    # termgraph requires Python 3.6+, so guard the call accordingly.
+    if sys.version_info >= (3, 6):
+        assert len(data) == len(labels)
+        if custom_args is None:
+            custom_args = {}
+        args = {}
+        args.update(TERMGRAPH_DEFAULT_ARGS)
+        args.update(custom_args)
+        termgraph.chart([], [[datap] for datap in data], args, [str(label) for label in labels])
+# ---------------------------------------------------------------------------
+
+
+class RenderStats:
+
+    def __init__(self, args, colors=None):
+        self.termgraph_args = args
+        self.COLORS = colors or []
+        self._report_group_map = collections.OrderedDict()
+
+    def add(self, report):
+        assert isinstance(report, Report)
+        if report.group not in self._report_group_map:
+            self._report_group_map[report.group] = []
+        self._report_group_map[report.group].append(report)
+
+    def render_report(self, report):
+        if callable(report.content):
+            report.content()
+        elif isinstance(report.content, str):
+            print(report.content)
+
+    def render_report_separator(self, length, separator_char="*"):
+        print(separator_char * length)
+
+    def report(self):
+        for key in self._report_group_map:
+            self.render_report_separator(70)
+            for report in self._report_group_map[key]:
+                self.render_report(report)
+
+
+class CollectStats:
+
+    def __init__(self, cache):
+        self.cache = cache
+        self.rs = RenderStats(TERMGRAPH_DEFAULT_ARGS)
+        if not self.cache.has(FIRST_INSTALLED):
+            self.cache.clear()
+            # Format defaults to DATESTRING_FORMATS[0] but can be changed by the user.
+            self.cache.set(FIRST_INSTALLED, datetime.today().strftime(DATESTRING_FORMATS[0]))
+
+    def __getitem__(self, key):
+        # Convenience accessor so stored stats can be read as self[KEY].
+        return self.cache.get(key)
+
+    def get_days_since_first_install(self):
+        # Days elapsed between the recorded install date and today.
+        first_installed = datetime.strptime(self.cache.get(FIRST_INSTALLED), DATESTRING_FORMATS[0])
+        return (datetime.today() - first_installed).days
+
+    def load_time_stats(self):
+        rs = self.rs
+
+        days_since_first_install = self.get_days_since_first_install() or 0
+        total_request_count = self[TOTAL_REQUESTS] or 0
+
+        # Report the total time howdoi has been in use.
+        rs.add(Report('Time stats', 'You have been using howdoi for {} days'.format(days_since_first_install)))
+        rs.add(Report('Time stats', 'On average you make {} queries per day'.format(
+            safe_divide(total_request_count, days_since_first_install))))
+
+        hour_of_day_map = self[HOUR_OF_DAY_KEY]
+
+        if total_request_count > 0 and hour_of_day_map:
+            most_active_hour_of_the_day = max(hour_of_day_map, key=lambda hour: hour_of_day_map[hour])
+            rs.add(Report('Time stats', 'You are most active between {}:00 and {}:00'.format(
+                most_active_hour_of_the_day, most_active_hour_of_the_day + 1)))
+
+            keys, values = [], []
+            for k in hour_of_day_map:
+                lower_time_bound = str(k) + ":00"
+                upper_time_bound = str(k + 1) + ":00" if k + 1 < 24 else "00:00"
+                keys.append(lower_time_bound + "-" + upper_time_bound)
+                values.append(hour_of_day_map[k])
+
+            rs.add(
+                Report(
+                    'time-related-stats', lambda: draw_graph(data=values, labels=keys, custom_args={
+                        'suffix': ' uses', 'format': '{:<1d}'})
+                )
+            )
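+    # Illustrative sketch of the Report/RenderStats flow used throughout
+    # these render_* methods (the message text is made up):
+    #   rs = RenderStats(TERMGRAPH_DEFAULT_ARGS)
+    #   rs.add(Report('Time stats', 'You have been using howdoi for 3 days'))
+    #   rs.report()  # prints a '*' separator, then each report in the group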
+    def render_search_engine_stats(self):
+        rs = self.rs
+        search_engine_frequency = self[SEARCH_ENGINES]
+        if search_engine_frequency is not None:
+            max_search_engine = max(search_engine_frequency,
+                                    key=lambda engine: search_engine_frequency[engine])
+            rs.add(Report('Search-engine-stats',
+                          'Your most used search engine is {}'.format(max_search_engine.title())))
+
+            se_keys = []
+            se_values = []
+            # Collect label/value pairs for the search-engine usage graph.
+            for i in search_engine_frequency:
+                se_keys.append(i)
+                se_values.append(search_engine_frequency[i])
+
+            # Add those values to the termgraph report.
+            rs.add(Report('search-engine-stats', lambda: draw_graph(
+                data=se_values,
+                labels=se_keys,
+                custom_args={'suffix': ' uses', 'format': '{:<1d}'}
+            )))
+
+    def render_query_stats(self):
+        rs = self.rs
+
+        query_map = self[QUERY_KEY]
+        query_words_map = self[QUERY_WORD_KEY]
+
+        # Only the top five entries are reported.
+        top_5_query_key_vals = get_top_n_key_val_pairs_from_dict(query_map, 5)
+        top_5_query_words_key_vals = get_top_n_key_val_pairs_from_dict(query_words_map, 5)
+
+        if len(top_5_query_key_vals) > 0:
+            most_common_query = top_5_query_key_vals[0][0]
+            rs.add(
+                Report(
+                    'query-stats', 'The query you\'ve made the most times is {}'.format(
+                        most_common_query
+                    )
+                )
+            )
+
+        if len(top_5_query_words_key_vals) > 0:
+            most_common_query_word = top_5_query_words_key_vals[0][0]
+            rs.add(
+                Report(
+                    'query-stats', 'The most common word in your queries is {}'.format(
+                        most_common_query_word
+                    )
+                )
+            )
+
+            data = [val for _, val in top_5_query_words_key_vals]
+            labels = [key for key, _ in top_5_query_words_key_vals]
+
+            rs.add(
+                Report('query-stats', lambda: draw_graph(data=data, labels=labels,
+                       custom_args={'suffix': ' uses', 'format': '{:<1d}'})
+                       ))
+
+    def search_engine_stats(self, search_engine):
+        # Track how often each search engine is used.
+        if search_engine:
+            search_engines_storage = self.cache.get(SEARCH_ENGINES)
+            if search_engines_storage is None:
+                search_engines_storage = collections.Counter()
+            search_engines_storage[search_engine] += 1
+            self.cache.set(SEARCH_ENGINES, search_engines_storage)
+
+    def render_request_stats(self):
+        rs = self.rs
+
+        total_request_count = self[TOTAL_REQUESTS] or 0
+        cached_request_count = self[CACHE_HITS] or 0
+        outbound_request_count = total_request_count - cached_request_count
+
+        successful_requests = self[SUCCESS_RESULT_KEY] or 0
+        failed_requests = self[ERROR_RESULT_KEY] or 0
+
+        rs.add(
+            Report('Network Stats', 'Of the {} requests you have made using howdoi, {} have been saved by howdoi\'s cache'.format(
+                total_request_count, cached_request_count))
+        )
+
+        rs.add(
+            Report('Network Stats', 'Also, {} requests have succeeded, while {} have failed due to connection issues or some other problem.'.format(
+                successful_requests, failed_requests))
+        )
+
+        if total_request_count > 0:
+            rs.add(
+                Report(
+                    'network-request-stats', lambda: draw_graph(
+                        data=[safe_divide(outbound_request_count * 100, total_request_count),
+                              safe_divide(cached_request_count * 100, total_request_count)],
+                        labels=['Outbound Requests', 'Cache Saved Requests'],
+                        custom_args={'suffix': '%'}
+                    )
+                )
+            )
+
+        if successful_requests + failed_requests > 0:
+            rs.add(
+                Report('network-request-stats', lambda: draw_graph(
+                    data=[safe_divide(successful_requests * 100, successful_requests + failed_requests),
+                          safe_divide(failed_requests * 100, successful_requests + failed_requests)],
+                    labels=['Successful Requests', 'Failed Requests'],
+                    custom_args={'suffix': '%'}
+                )
+                )
+            )
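+    # For reference, the stats cache is a plain key-value store; after a few
+    # queries its contents look roughly like this (illustrative values only):
+    #   TOTAL_REQUESTS  -> 12
+    #   CACHE_HITS      -> 4
+    #   SEARCH_ENGINES  -> Counter({'google': 10, 'bing': 2})
+    #   QUERY_WORDS     -> Counter({'print': 3, 'python': 2, ...})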
+    # Main entry point for the termgraph-based stats dashboard.
+    def render_stats(self):
+        print("RENDERING STATS, to disable: howdoi --disable_stats")
+        self.render_search_engine_stats()
+        self.load_time_stats()
+        self.render_query_stats()
+        self.render_request_stats()
+        self.rs.report()
+
+    def increase_cache_hits(self):
+        self.cache.inc(CACHE_HITS)
+
+    def increase_key(self, key):
+        self.cache.inc(key)
+
+    def increase_days_used(self):
+        # Record one use of howdoi against today's date.
+        current_date = datetime.today().strftime(DATESTRING_FORMATS[0])
+        self.increase_key(str(current_date))
+
+    def increase_hours_used(self):
+        # Record the current hour in the HOUR_OF_DAY histogram.
+        self.create_storage(HOUR_OF_DAY_KEY, datetime.now().hour)
+
+    def increase_requests(self):
+        self.cache.inc(TOTAL_REQUESTS)
+
+    def process_response(self, res):
+        # Classify the response as errored or valid and count it.
+        if not res or (isinstance(res, dict) and res.get('error')):
+            key = ERROR_RESULT_KEY
+        else:
+            key = SUCCESS_RESULT_KEY
+        self.cache.inc(key)
+
+    def process_links(self, question_links):
+        if not question_links:  # nothing to record for empty links
+            return
+        links_storage = self.cache.get(PROCESSED_LINKS)
+        if links_storage is None:
+            links_storage = collections.Counter()
+        # Increase the frequency of each processed link by one.
+        for i in question_links:
+            links_storage[i] += 1
+        self.cache.set(PROCESSED_LINKS, links_storage)
+
+    def create_storage(self, key, value):
+        # Increment `value` inside the Counter stored under `key`.
+        map_storage = self.cache.get(key)
+        if map_storage is None:
+            map_storage = collections.Counter()
+        map_storage[value] += 1
+        self.cache.set(key, map_storage)
+
+    def process_user_query(self, query):
+        if not query:
+            return
+        query = query.strip()
+        query_storage = self.cache.get(QUERY_KEY)
+        if query_storage is None:
+            query_storage = collections.Counter()
+        query_storage[query] += 1
+        self.cache.set(QUERY_KEY, query_storage)
+        # Count each non-stop-word token of the query as well.
+        for token in query.split(" "):
+            token = token.lower()
+            if token not in REDUNDANT_WORDS:
+                self.create_storage(QUERY_WORD_KEY, token)
+
+    def run(self, args):
+        # Main runner: update every counter for this invocation of howdoi.
+        self.increase_requests()
+        self.increase_days_used()
+        self.search_engine_stats(args.get('search_engine'))
+        self.increase_hours_used()
+        self.process_user_query(args.get('query'))
diff --git a/howdoi/utils.py b/howdoi/utils.py
new file mode 100644
index 000000000..f0371b332
--- /dev/null
+++ b/howdoi/utils.py
@@ -0,0 +1,17 @@
+import heapq
+
+
+def get_top_n_key_val_pairs_from_dict(dict_, N):
+    # Keep a min-heap of (value, key) pairs capped at N entries, then return
+    # the top N pairs as (key, value), sorted by value in descending order.
+    top_n_key_value_pairs = []
+    if isinstance(dict_, dict):
+        for key in dict_:
+            heapq.heappush(top_n_key_value_pairs, (dict_[key], key))
+            if len(top_n_key_value_pairs) > N:
+                heapq.heappop(top_n_key_value_pairs)
+
+    top_n_key_value_pairs.sort(reverse=True)
+    return [(k, v) for v, k in top_n_key_value_pairs]
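+
+# Example (illustrative): picking the two largest entries by count,
+#   get_top_n_key_val_pairs_from_dict({'a': 3, 'b': 1, 'c': 2}, 2)
+# returns [('a', 3), ('c', 2)].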
+
+
+def safe_divide(numerator, denominator):
+    # Guard against division by zero when no data has been collected yet.
+    return numerator / denominator if denominator != 0 else 0
diff --git a/setup.py b/setup.py
index 6550967b2..884f202cd 100644
--- a/setup.py
+++ b/setup.py
@@ -100,6 +100,7 @@ def read(*names):
         'cachelib',
         'appdirs',
         'keep',
+        'termgraph'
     ],
     cmdclass={
         'lint': Lint