Skip to content

Commit

Permalink
Merge pull request #88 from zen-ayush/main
Browse files Browse the repository at this point in the history
knowledge extraction from open copilot
  • Loading branch information
AvishaiEZen authored Dec 12, 2024
2 parents 7d11098 + 55563a4 commit 1cad3b4
Show file tree
Hide file tree
Showing 5 changed files with 1,616 additions and 0 deletions.
71 changes: 71 additions & 0 deletions src/powerpwn/copilot_studio/modules/deep_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,63 @@ def run_pup_commands(existing_bots: List[str]):
return sort_unique_values_in_file(open_bots_path)
return []

def query_using_pup(open_bots: List[str]):
    """
    Execute the Puppeteer knowledge-extraction script for each bot URL given.

    Any Excel report left over from a previous run is deleted first, then
    ``tools/pup_query_webchat/query_chat.js`` is run once per URL with ``node``.
    A failure for one bot is logged and does not stop the remaining bots.

    :param open_bots: The list of bot URLs needed to check
    :return: The dictionary produced by ``parse_chatbot_results`` for the
             generated Excel report, or an empty dict if no report exists
             after all bots were processed.
    """
    pup_path = get_project_file_path("tools/pup_query_webchat", "query_chat.js")
    bots_has_knowledge_path = get_project_file_path("final_results/", "extracted_knowledge.xlsx")

    # Delete the existing Excel file to start fresh
    if os.path.exists(bots_has_knowledge_path):
        os.remove(bots_has_knowledge_path)
        logging.debug(f"Deleted existing file: {bots_has_knowledge_path}")

    for bot_url in open_bots:
        logging.debug(f"Running command: `node {pup_path} {bot_url}`")
        try:
            # Pass an argument list and no shell: bot URLs come from scan
            # results and may contain characters such as '&', ';' or '$(...)'
            # that a shell would interpret instead of passing through verbatim.
            subprocess.run(["node", str(pup_path), str(bot_url)], check=True)
        except subprocess.CalledProcessError as e:
            logging.error(f"Error occurred while running Puppeteer: {e}")
        except OSError as e:
            # Raised when the process cannot be started at all (e.g. `node`
            # is not installed); retrying for the remaining URLs is pointless.
            logging.error(f"Could not start Puppeteer script: {e}")
            break

    if os.path.exists(bots_has_knowledge_path):
        # Read the output Excel file and create a dictionary
        return parse_chatbot_results(bots_has_knowledge_path)

    return {}

def parse_chatbot_results(file_path):
    """
    Parses the output Excel file generated by query_chat.js and returns a dictionary.

    The sheet is read with the columns ``URL``, ``Has Knowledge`` and
    ``Titles`` (a semicolon-separated list). Rows without a URL are skipped,
    empty cells become empty strings / empty lists, and blank title entries
    (e.g. from a trailing ``;``) are dropped.

    :param file_path: Path to the output Excel file.
    :return: Dictionary with bot URL as key and knowledge info as value, in
             the form ``{url: {'has_knowledge': str, 'titles': [str, ...]}}``.
    """

    def _cell_text(value):
        # Empty Excel cells are read back as NaN; calling str() on that would
        # yield the literal text "nan" instead of an empty value.
        return '' if pd.isna(value) else str(value).strip()

    # Read the Excel file into a DataFrame
    df = pd.read_excel(file_path)

    bot_results = {}
    for _, row in df.iterrows():
        url = _cell_text(row.get('URL', ''))
        if not url:
            # A row without a URL cannot be attributed to any bot.
            continue

        # Coerce to text first so a purely numeric title cell does not break
        # the split, then drop the empty pieces left by stray semicolons.
        titles_text = _cell_text(row.get('Titles', ''))
        titles = [title.strip() for title in titles_text.split(';') if title.strip()]

        bot_results[url] = {
            'has_knowledge': _cell_text(row.get('Has Knowledge', '')),
            'titles': titles
        }

    return bot_results

def camel_case_split(identifier: str):
"""
Expand Down Expand Up @@ -546,6 +603,7 @@ def __init__(self, args):
self.default_solution_prefix = ""
self.existing_bots = []
self.open_bots = []
self.bot_has_knowledge = {}
self.run()

def dump_results(self):
Expand Down Expand Up @@ -729,6 +787,9 @@ def run(self):

print("Done, results saved under final_results/chat_exists_output.txt")

self.bot_has_knowledge = query_using_pup(self.open_bots)
print("Done, extracted knowledge results saved under final_results/extracted_knowledge.xlsx")

else:
logging.error("Did not find a solution publisher prefix")
else:
Expand Down Expand Up @@ -809,6 +870,9 @@ def run(self):

print("Done, results saved under final_results/chat_exists_output.txt")

self.bot_has_knowledge = query_using_pup(self.open_bots)
print("Done, extracted knowledge results saved under final_results/extracted_knowledge.xlsx")

else:
logging.error("Did not find a solution publisher prefix")

Expand Down Expand Up @@ -895,6 +959,9 @@ def run(self):

print("Done, results saved under final_results/chat_exists_output.txt")

self.bot_has_knowledge = query_using_pup(self.open_bots)
print("Done, extracted knowledge results saved under final_results/extracted_knowledge.xlsx")

else:
logging.error("Did not find a default solution publisher prefix")

Expand Down Expand Up @@ -976,7 +1043,11 @@ def run(self):

print("Done, results saved under final_results/chat_exists_output.txt")

# Query every open bot for its knowledge sources; the report is written to
# extracted_knowledge.xlsx (not the chat_exists_output file reported above).
self.bot_has_knowledge = query_using_pup(self.open_bots)
print("Done, extracted knowledge results saved under final_results/extracted_knowledge.xlsx")

else:
logging.error("Did not find a solution publisher prefix")

self.dump_results()

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
node_modules
Loading

0 comments on commit 1cad3b4

Please sign in to comment.