-
Notifications
You must be signed in to change notification settings - Fork 0
/
psdecode.py
250 lines (197 loc) · 9.73 KB
/
psdecode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
"""
Script Name: psdecode.py
Author: Justin Lund
Last modified: 05/08/24
Date created: 06/05/23
Version: 1.4
Purpose:
This script is designed to de-obfuscate Powershell code
Dependencies:
- pygments: Used for syntax-highlighted code, for ease of reading
- chardet: Used for detecting the encoding of input files
Usage:
python3 psdecode.py -i obfuscated_powershell.ps1
This will provide you with an interactive menu:
0) Show code sample
1) De-obfuscate PowerShell re-ordering
2) Remove backticks
3) Re-concatenate strings
4) Undo aLtErNaTiNg cApS (Title Case Everything)
5) New lines at semicolons
s) Save code
q) Quit
0 will prompt you as to how many lines of code you want to print to screen.
This is used to check if deobfuscation techniques were successful.
The reason the options are separated and the process isn't 100% automated,
is that the order of techniques applied may need to be changed depending on the code.
Once satisfied with the output, press 's' to save it to a new file.
"""
import re
import argparse
import chardet
from pygments import highlight
from pygments.lexers import PowerShellLexer
from pygments.formatters import TerminalFormatter
# Detect and return the character encoding of a file from its first 4096 bytes.
def detect_encoding(file_path):
    """Guess the text encoding of *file_path*.

    Only the first 4096 bytes are read (in binary mode) and handed to
    ``chardet.detect``; the value stored under the ``'encoding'`` key of
    chardet's result is returned unchanged.
    NOTE(review): chardet documents that this value can be ``None`` when it
    cannot decide (e.g. an empty file) -- callers should be ready for that.
    """
    with open(file_path, 'rb') as handle:
        sample = handle.read(4096)
    return chardet.detect(sample)['encoding']
# Undo string reordering, e.g. ("{1}{0}{2}" -f 'ing','str','s') -> strings
def deobfuscate_powershell_reorder(lines):
    """Resolve PowerShell ``-f`` format-operator reordering, line by line.

    For every line the function:

    1. rewrites ``[char]NN`` casts as the quoted character (``[char]65`` ->
       ``'A'``),
    2. deletes every backtick,
    3. repeatedly finds expressions of the form
       ``("{i}{j}..." -f 'a','b',...)`` and replaces the whole parenthesised
       expression with the arguments joined in the order given by the format
       string. Inner expressions are resolved first, so nested uses work.

    The reassembled text is inserted bare, without surrounding quotes.
    Expressions that cannot be rebuilt (no index in the format string, or an
    index that points past the last argument) are left untouched.

    Args:
        lines: iterable of strings (typically the result of ``readlines()``).

    Returns:
        A new list with one processed string per input line.
    """
    # Characters accepted inside the argument list. This text is spliced into
    # a regex character class, so the "|" separators are literal class
    # members, not alternation.
    charset = r"\w|\d|\n|\s|,|.|\-|=|/|:|#|_|{|}|\[|\]"
    # ("{1}{0}" -f 'a','b') -- group 1: format string, group 2: argument list.
    finder = re.compile(
        r"\(\"([{\d+}]+)\"\s*-f\s*(['" + charset + r"',]+)\)", re.IGNORECASE
    )
    # The numeric index inside each {n} of the format string.
    position = re.compile(r"{(\d+)}")
    # The text of each single-quoted argument.
    argument = re.compile(r"'([" + charset + r"]+)'")
    # Quoted stand-in left behind for an already-resolved expression.
    placeholder = re.compile(r"'{#subs_\d+}'")
    # [char]65 style casts.
    char_cast = re.compile(r"\[char\](\d+)", re.IGNORECASE)

    def char_literal(match):
        # Substituting per match (rather than str.replace on the matched
        # text) keeps "[char]65" from clobbering the prefix of "[char]654".
        try:
            return "'" + chr(int(match.group(1), 10)) + "'"
        except (ValueError, OverflowError):
            # Not a valid code point: leave the cast as written.
            return match.group()

    new_content = []
    for line in lines:
        content = char_cast.sub(char_literal, line).replace("`", "")
        occurrences = {}  # placeholder -> reassembled text

        # Keep resolving until a full pass changes nothing. Stopping on "no
        # progress" (instead of "no match") prevents an endless loop when a
        # matched expression cannot be rebuilt.
        progressed = True
        while progressed:
            progressed = False
            for found in finder.finditer(content):
                indices = [int(p) for p in position.findall(found.group(1))]
                pieces = argument.findall(
                    found.group(2).replace("\n", "").strip()
                )
                if not indices or any(i >= len(pieces) for i in indices):
                    continue  # malformed or unsupported: leave it alone

                # A piece that is itself a placeholder stands for an inner
                # expression resolved on an earlier pass.
                out = "".join(
                    occurrences.get("'" + pieces[i] + "'", pieces[i].strip())
                    for i in indices
                )
                key = "'{#subs_" + str(len(occurrences)) + "}'"
                occurrences[key] = out
                content = content.replace(found.group(), key)
                progressed = True

        # Swap every remaining placeholder for its text; anything that merely
        # looks like a placeholder but was not created here is kept verbatim.
        content = placeholder.sub(
            lambda m: occurrences.get(m.group(), m.group()), content
        )
        new_content.append(content)

    return new_content
# Remove PowerShell backticks (`) except a line-continuation backtick at the end of a line.
def remove_ticks(line):
    """Strip obfuscating backticks from *line*.

    Every backtick is removed except one that is the last character before
    the line terminator (or the very last character when there is no
    terminator), because there it is PowerShell's line-continuation marker.
    The line terminator itself is preserved, and an empty string is returned
    unchanged.
    """
    body = line.rstrip('\r\n')
    ending = line[len(body):]
    if body.endswith('`'):
        # Keep the trailing continuation backtick, drop all the others.
        return body[:-1].replace('`', '') + '`' + ending
    return body.replace('`', '') + ending
# Replace concatenated string literals in a line with a single string by removing the '+' operator.
def concatenate(line):
    """Merge adjacent quoted literals joined by ``+`` into one literal.

    ``'In' + 'voke'`` becomes ``'Invoke'``; chains such as ``'a'+'b'+'c'``
    collapse completely, and empty literals are handled. The merged literal
    uses the quote character of the left operand. A mixed-quote pair is left
    untouched when the right operand contains the left operand's quote
    character, since merging would produce a broken literal.

    Args:
        line: the text to process.

    Returns:
        The text with every mergeable concatenation collapsed.
    """
    # Groups: 1 = left quote, 2 = left content, 3 = right quote, 4 = right
    # content. A backreference cannot be used inside a character class
    # ("[^\1]" would mean "not chr(1)"), so "any character that is not the
    # opening quote" is expressed with a negative lookahead instead.
    pattern = re.compile(
        r"(['\"])((?:(?!\1).)*)\1\s*\+\s*(['\"])((?:(?!\3).)*)\3",
        re.DOTALL,
    )

    pos = 0
    while True:
        match = pattern.search(line, pos)
        if match is None:
            return line
        left_quote, left, right_quote, right = match.groups()
        if left_quote != right_quote and left_quote in right:
            # Unsafe to merge; resume at the right-hand literal so it can
            # still be joined with whatever follows it.
            pos = match.start(3)
            continue
        merged = left_quote + left + right + left_quote
        line = line[:match.start()] + merged + line[match.end():]
        # Re-scan from the merged literal so chained concatenations collapse.
        pos = match.start()
# Title-case every word in a line; used to undo aLtErNaTiNg cApS so the code is easier to read.
def title_case_line(line):
    """Return *line* with each word converted to title case.

    A "word" is a run of ASCII letters, digits, underscores and colons that
    sits between word boundaries; each such run is passed through
    ``str.title``. Everything else in the line is left as it is.
    """
    return re.sub(
        r'\b[a-zA-Z0-9_:]+\b',
        lambda token: token.group(0).title(),
        line,
    )
# Apply title-case conversion to every line of the script
def title_case_script(lines):
    """Return a new list holding ``title_case_line(line)`` for each input line."""
    return list(map(title_case_line, lines))
# Break lines after each semicolon
def add_new_lines_at_semicolons(lines):
    """Split every line after each ``;`` so each statement gets its own line.

    A newline is inserted after every semicolon, the text is split on
    newlines, empty pieces are discarded (so blank lines disappear), and
    each remaining piece is returned with a single trailing newline.
    """
    return [
        segment + '\n'
        for line in lines
        for segment in line.replace(';', ';\n').split('\n')
        if segment
    ]
# Main function to run interactive de-obfuscation menu
def main():
    """Load a PowerShell script and run the interactive de-obfuscation menu.

    The script named by ``-i/--input`` is read with the encoding reported by
    ``detect_encoding``. The menu then loops: options 1-5 each replace the
    in-memory lines with a transformed copy, ``0`` prints a highlighted
    sample, ``s`` writes the lines to a file (UTF-8) and exits, ``q`` exits
    without saving.

    Invalid input (a non-numeric line count, an unwritable output path) is
    reported and the menu is shown again, so the work done so far in the
    session is not lost.

    Raises:
        SystemExit: if the input file cannot be read.
    """
    parser = argparse.ArgumentParser(description='De-obfuscates PowerShell scripts.')
    parser.add_argument('-i', '--input', help='The input PowerShell script file.')
    args = parser.parse_args()

    # Exit if no input file is provided
    if args.input is None:
        print("No input file provided. Please provide an input file with the -i option.")
        return

    # Detect the encoding, then read the file with it
    try:
        encoding_used = detect_encoding(args.input)
        with open(args.input, 'r', encoding=encoding_used) as f:
            lines = f.readlines()
    except OSError as err:
        raise SystemExit(f"Could not read {args.input!r}: {err}")

    # Menu options that simply transform the whole list of lines
    transforms = {
        "1": deobfuscate_powershell_reorder,
        "2": lambda current: [remove_ticks(line) for line in current],
        "3": lambda current: [concatenate(line) for line in current],
        "4": title_case_script,
        "5": add_new_lines_at_semicolons,
    }

    # Interactive menu loop
    while True:
        print()
        print("0) Show code sample")
        print("1) De-obfuscate PowerShell re-ordering")
        print("2) Remove backticks")
        print("3) Re-concatenate strings")
        print("4) Undo aLtErNaTiNg cApS (Title Case Everything)")
        print("5) New lines at semicolons")
        print("s) Save code")
        print("q) Quit")
        option = input("Choose an option: ").strip().lower()

        if option in transforms:
            lines = transforms[option](lines)
        elif option == "0":
            try:
                num_lines = int(input("How many lines of the script would you like to print? "))
            except ValueError:
                # Keep the session alive instead of crashing on a typo
                print("Please enter a whole number.")
                continue
            code_to_print = ''.join(lines[:num_lines])
            print(highlight(code_to_print, PowerShellLexer(), TerminalFormatter()))
        elif option == "s":
            output_file = input("Enter the output file name: ")
            try:
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.writelines(lines)
            except OSError as err:
                # Let the user pick another path rather than losing the result
                print(f"Could not save to {output_file!r}: {err}")
                continue
            break
        elif option == "q":
            break
        else:
            print("Invalid option, please try again.")


# Ensure the script can run standalone
if __name__ == "__main__":
    main()