Skip to content

Commit 8934b1e

Browse files
committed
fix #1190 support to display packet content as UTF-8
1 parent d991fd2 commit 8934b1e

File tree

4 files changed

+181
-50
lines changed

4 files changed

+181
-50
lines changed

configure.ac

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,12 @@ AC_LBL_C_INIT(V_CCOPT, V_INCLS)
4545
#
4646
AC_CHECK_HEADERS(rpc/rpc.h rpc/rpcent.h)
4747

48+
#
49+
# Check for wide character support for UTF-8 handling
50+
#
51+
AC_CHECK_HEADERS(wchar.h)
52+
AC_CHECK_FUNCS(wcwidth)
53+
4854
# On Linux, if Autoconf version >= 2.72 and GNU C Library version >= 2.34,
4955
# uncomment AC_SYS_YEAR2038_RECOMMENDED to ensure time_t is Y2038-safe.
5056
# (Can be done by autogen.sh)

netdissect.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,7 @@ struct netdissect_options {
231231
int ndo_Aflag; /* print packet only in ASCII observing TAB,
232232
* LF, CR and SPACE as graphical chars
233233
*/
234+
int ndo_utf8; /* interpret ASCII output as UTF-8 */
234235
int ndo_Hflag; /* dissect 802.11s draft mesh standard */
235236
const char *ndo_protocol; /* protocol */
236237
jmp_buf ndo_early_end; /* jmp_buf for setjmp()/longjmp() */

print-ascii.c

Lines changed: 158 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@
4444

4545
#include <stdio.h>
4646

47+
#ifdef HAVE_WCHAR_H
48+
#include <wchar.h>
49+
#include <wctype.h>
50+
#endif
51+
4752
#include "netdissect-ctype.h"
4853

4954
#include "netdissect.h"
@@ -56,6 +61,71 @@
5661
#define HEXDUMP_HEXSTUFF_PER_LINE \
5762
(HEXDUMP_HEXSTUFF_PER_SHORT * HEXDUMP_SHORTS_PER_LINE)
5863

64+
#ifdef HAVE_WCHAR_H
65+
66+
/*
 * is_utf8_printable() is adapted from ngrep.
 *
 * Decode one multi-byte character from the start of s and report whether
 * it is printable.
 *
 * s         - bytes to examine; may be NULL.
 * max_len   - number of bytes available at s.
 * width_out - if not NULL, receives the display width in columns when the
 *             character is printable (0 for combining characters, 2 for
 *             wide characters such as CJK); left untouched otherwise.
 *
 * Returns the number of bytes making up the character if it is printable,
 * or 0 if s is NULL, max_len is 0, the bytes form an invalid or incomplete
 * sequence, or the character is NUL or not printable.
 *
 * The bytes are decoded with mbrtowc(), which follows the LC_CTYPE
 * category of the current locale, so they are only interpreted as UTF-8
 * when that locale uses UTF-8.
 */
static int is_utf8_printable(const unsigned char *s, size_t max_len, int *width_out) {
    mbstate_t state = {0};
    wchar_t wc;
    size_t len;
    int is_printable;
    int w;

    if (!s || max_len == 0)
        return 0;

    len = mbrtowc(&wc, (const char *)s, max_len, &state);

    /*
     * 0 means a NUL character, (size_t)-1 an encoding error and
     * (size_t)-2 an incomplete sequence (more bytes needed); none of
     * them can be displayed.
     */
    if (len == 0 || len == (size_t)-1 || len == (size_t)-2)
        return 0;

#if defined(_WIN32) || defined(_WIN64)
    {
        /*
         * Compare as unsigned long so that the range checks stay
         * meaningful whatever the width of wchar_t is.
         */
        unsigned long ch = (unsigned long)wc;

        /*
         * Windows iswprint() is too conservative, so also accept any
         * character outside the surrogate range: the rest of the BMP,
         * the private use area and the supplementary planes.
         */
        is_printable = iswprint(wc) ||
            (ch >= 0x80 && ch < 0xD800) ||
            (ch >= 0xE000 && ch < 0x110000);

        /* But never accept C0/C1 control characters or DEL. */
        if (ch < 0x20 || (ch >= 0x7F && ch < 0xA0))
            is_printable = 0;
    }
#else
    is_printable = iswprint(wc) != 0;
#endif

    if (!is_printable)
        return 0;

#ifdef HAVE_WCWIDTH
    /*
     * 1 for normal characters, 2 for wide characters, 0 for combining
     * characters; a negative result (no width known) is treated as 1.
     */
    w = wcwidth(wc);
    if (w < 0)
        w = 1;
#else
    /* configure found no wcwidth(); assume a single column. */
    w = 1;
#endif

    if (width_out)
        *width_out = w;
    return (int)len;
}
127+
#endif
128+
59129
void
60130
ascii_print(netdissect_options *ndo,
61131
const u_char *cp, u_int length)
@@ -71,28 +141,49 @@ ascii_print(netdissect_options *ndo,
71141
truncated = TRUE;
72142
}
73143
ND_PRINT("\n");
74-
while (length != 0) {
75-
s = GET_U_1(cp);
76-
cp++;
77-
length--;
78-
if (s == '\r') {
79-
/*
80-
* Don't print CRs at the end of the line; they
81-
* don't belong at the ends of lines on UN*X,
82-
* and the standard I/O library will give us one
83-
* on Windows so we don't need to print one
84-
* ourselves.
85-
*
86-
* In the middle of a line, just print a '.'.
87-
*/
88-
if (length > 1 && GET_U_1(cp) != '\n')
89-
ND_PRINT(".");
144+
145+
while (length > 0) {
146+
int utf8_len;
147+
int j;
148+
149+
utf8_len = 0;
150+
151+
#ifdef HAVE_WCHAR_H
152+
if (ndo->ndo_utf8) {
153+
utf8_len = is_utf8_printable(cp, length, NULL);
154+
}
155+
#endif
156+
157+
if (utf8_len > 0) {
158+
/* Valid printable UTF-8 character */
159+
for (j = 0; j < utf8_len; j++)
160+
ND_PRINT("%c", cp[j]);
161+
cp += utf8_len;
162+
length -= utf8_len;
163+
90164
} else {
91-
if (!ND_ASCII_ISGRAPH(s) &&
92-
(s != '\t' && s != ' ' && s != '\n'))
93-
ND_PRINT(".");
94-
else
95-
ND_PRINT("%c", s);
165+
s = GET_U_1(cp);
166+
cp++;
167+
length--;
168+
if (s == '\r') {
169+
/*
170+
* Don't print CRs at the end of the line; they
171+
* don't belong at the ends of lines on UN*X,
172+
* and the standard I/O library will give us one
173+
* on Windows so we don't need to print one
174+
* ourselves.
175+
*
176+
* In the middle of a line, just print a '.'.
177+
*/
178+
if (length > 1 && GET_U_1(cp) != '\n')
179+
ND_PRINT(".");
180+
} else {
181+
if (!ND_ASCII_ISGRAPH(s) &&
182+
(s != '\t' && s != ' ' && s != '\n'))
183+
ND_PRINT(".");
184+
else
185+
ND_PRINT("%c", s);
186+
}
96187
}
97188
}
98189
if (truncated)
@@ -104,52 +195,69 @@ hex_and_ascii_print_with_offset(netdissect_options *ndo, const char *indent,
104195
const u_char *cp, u_int length, u_int offset)
105196
{
106197
u_int caplength;
107-
u_int i;
108-
u_int s1, s2;
109-
u_int nshorts;
198+
u_int nbytes_unprinted;
199+
u_int s1;
110200
int truncated = FALSE;
111201
char hexstuff[HEXDUMP_SHORTS_PER_LINE*HEXDUMP_HEXSTUFF_PER_SHORT+1], *hsp;
112-
char asciistuff[ASCII_LINELENGTH+1], *asp;
202+
char asciistuff[ASCII_LINELENGTH+1+4], *asp;
203+
u_int utf8_bytes_to_skip = 0;
113204

114205
caplength = ND_BYTES_AVAILABLE_AFTER(cp);
115206
if (length > caplength) {
116207
length = caplength;
117208
truncated = TRUE;
118209
}
119-
nshorts = length / sizeof(u_short);
120-
i = 0;
210+
nbytes_unprinted = 0;
121211
hsp = hexstuff; asp = asciistuff;
122-
while (nshorts != 0) {
212+
while (length != 0) {
123213
s1 = GET_U_1(cp);
214+
215+
// insert the space that precedes each 2-byte group ("short") in the hex column
216+
if ((hsp - hexstuff) % HEXDUMP_HEXSTUFF_PER_SHORT == 0) {
217+
(void)snprintf(hsp, sizeof(hexstuff) - (hsp - hexstuff), " ");
218+
hsp++;
219+
}
220+
221+
// add the byte
222+
(void)snprintf(hsp, sizeof(hexstuff) - (hsp - hexstuff), "%02x", s1);
223+
hsp += 2;
224+
225+
if (utf8_bytes_to_skip > 0) {
226+
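// continuation byte of a character that was already emitted in full; pad with a space only when it falls on a new output line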
227+
if (nbytes_unprinted == (asp - asciistuff)) {
228+
*(asp++) = ' ';
229+
}
230+
utf8_bytes_to_skip--;
231+
} else {
232+
// try to add the display (utf8) chars
233+
#ifdef HAVE_WCHAR_H
234+
utf8_bytes_to_skip = ndo->ndo_utf8 ? is_utf8_printable(cp, length, NULL) : 0;
235+
#endif
236+
if (utf8_bytes_to_skip > 0) {
237+
u_int j;
238+
for (j=0; j<utf8_bytes_to_skip; j++) {
239+
*(asp++) = (char)GET_U_1(cp+j);
240+
}
241+
utf8_bytes_to_skip --;
242+
} else {
243+
*(asp++) = (char)(ND_ASCII_ISGRAPH(s1) ? s1 : '.');
244+
}
245+
}
246+
124247
cp++;
125-
s2 = GET_U_1(cp);
126-
cp++;
127-
(void)snprintf(hsp, sizeof(hexstuff) - (hsp - hexstuff),
128-
" %02x%02x", s1, s2);
129-
hsp += HEXDUMP_HEXSTUFF_PER_SHORT;
130-
*(asp++) = (char)(ND_ASCII_ISGRAPH(s1) ? s1 : '.');
131-
*(asp++) = (char)(ND_ASCII_ISGRAPH(s2) ? s2 : '.');
132-
i++;
133-
if (i >= HEXDUMP_SHORTS_PER_LINE) {
248+
nbytes_unprinted++;
249+
if (nbytes_unprinted >= (HEXDUMP_SHORTS_PER_LINE * sizeof(u_short))) {
134250
*hsp = *asp = '\0';
135251
ND_PRINT("%s0x%04x: %-*s %s",
136252
indent, offset, HEXDUMP_HEXSTUFF_PER_LINE,
137253
hexstuff, asciistuff);
138-
i = 0; hsp = hexstuff; asp = asciistuff;
254+
nbytes_unprinted = 0; hsp = hexstuff; asp = asciistuff;
139255
offset += HEXDUMP_BYTES_PER_LINE;
140256
}
141-
nshorts--;
142-
}
143-
if (length & 1) {
144-
s1 = GET_U_1(cp);
145-
cp++;
146-
(void)snprintf(hsp, sizeof(hexstuff) - (hsp - hexstuff),
147-
" %02x", s1);
148-
hsp += 3;
149-
*(asp++) = (char)(ND_ASCII_ISGRAPH(s1) ? s1 : '.');
150-
++i;
257+
length--;
151258
}
152-
if (i > 0) {
259+
260+
if (nbytes_unprinted > 0) {
153261
*hsp = *asp = '\0';
154262
ND_PRINT("%s0x%04x: %-*s %s",
155263
indent, offset, HEXDUMP_HEXSTUFF_PER_LINE,
@@ -159,6 +267,7 @@ hex_and_ascii_print_with_offset(netdissect_options *ndo, const char *indent,
159267
nd_trunc_longjmp(ndo);
160268
}
161269

270+
162271
void
163272
hex_and_ascii_print(netdissect_options *ndo, const char *indent,
164273
const u_char *cp, u_int length)

tcpdump.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ The Regents of the University of California. All rights reserved.\n";
103103
#include <stdlib.h>
104104
#include <string.h>
105105
#include <limits.h>
106+
#include <locale.h>
106107
#ifdef _WIN32
107108
#include <windows.h>
108109
#else
@@ -668,6 +669,7 @@ show_remote_devices_and_exit(void)
668669
#define OPTION_LENGTHS 138
669670
#define OPTION_TIME_T_SIZE 139
670671
#define OPTION_SKIP 140
672+
#define OPTION_UTF8 141
671673

672674
static const struct option longopts[] = {
673675
{ "buffer-size", required_argument, NULL, 'B' },
@@ -712,6 +714,7 @@ static const struct option longopts[] = {
712714
{ "time-t-size", no_argument, NULL, OPTION_TIME_T_SIZE },
713715
{ "ip-oneline", no_argument, NULL, 'g' },
714716
{ "skip", required_argument, NULL, OPTION_SKIP },
717+
{ "utf8", no_argument, NULL, OPTION_UTF8 },
715718
{ "version", no_argument, NULL, OPTION_VERSION },
716719
{ NULL, 0, NULL, 0 }
717720
};
@@ -728,6 +731,12 @@ static const struct option longopts[] = {
728731
#define IMMEDIATE_MODE_USAGE ""
729732
#endif
730733

734+
#ifdef HAVE_WCHAR_H
735+
#define DISPLAY_UTF8_USAGE "[ --utf8 ]"
736+
#else
737+
#define DISPLAY_UTF8_USAGE
738+
#endif
739+
731740
#ifndef _WIN32
732741
/* Drop root privileges and chroot if necessary */
733742
static void
@@ -1631,6 +1640,8 @@ main(int argc, char **argv)
16311640
memset(ndo, 0, sizeof(*ndo));
16321641
ndo_set_function_pointers(ndo);
16331642

1643+
setlocale(LC_CTYPE, "");
1644+
16341645
cnt = -1;
16351646
device = NULL;
16361647
infile = NULL;
@@ -2094,6 +2105,10 @@ main(int argc, char **argv)
20942105
optarg, NULL, 0, INT_MAX, 0);
20952106
break;
20962107

2108+
case OPTION_UTF8:
2109+
++ndo->ndo_utf8;
2110+
break;
2111+
20972112
#ifdef HAVE_PCAP_SET_TSTAMP_PRECISION
20982113
case OPTION_TSTAMP_MICRO:
20992114
ndo->ndo_tstamp_precision = PCAP_TSTAMP_PRECISION_MICRO;
@@ -3507,5 +3522,5 @@ print_usage(FILE *f)
35073522
"\t\t[ --time-stamp-precision precision ] [ --micro ] [ --nano ]\n");
35083523
#endif
35093524
(void)fprintf(f,
3510-
"\t\t" z_FLAG_USAGE "[ -Z user ] [ expression ]\n");
3525+
"\t\t" DISPLAY_UTF8_USAGE z_FLAG_USAGE "[ -Z user ] [ expression ]\n");
35113526
}

0 commit comments

Comments
 (0)