Skip to content

Commit

Permalink
switch yxml_ret_t to bitfield
Browse files Browse the repository at this point in the history
HTML parsing may return multiple state changes for a single token
For example: ">" in "<input hidden/>" will return ATTRSTART | ELEMEND
  • Loading branch information
yne committed May 12, 2024
1 parent 7a0167a commit 4f09fad
Show file tree
Hide file tree
Showing 5 changed files with 245 additions and 207 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Build html2json on every push / pull request targeting master and
# upload the resulting binary as an asset of a date-tagged GitHub release.
name: CI

on:
  push:
    branches: [ "master" ]
  pull_request:
    branches: [ "master" ]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: make html2json
      - name: publish
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Space-separated list of files uploaded as release assets.
          RELEASE_FILES: html2json
        run: |
          # The release tag is the current date formatted as yymmdd.
          RELEASE_TAG="$(date +%y%m%d)"
          # Request creation of the release; `|| :` keeps the step going if curl fails.
          curl -s -XPOST -d '{"tag_name": "'$RELEASE_TAG'"}' \
            -H "Authorization: Bearer $GITHUB_TOKEN" \
            -H 'Content-Type: application/json' \
            "https://api.github.com/repos/${GITHUB_REPOSITORY}/releases" || :
          # Look the release up by tag to obtain the numeric id used by the upload URL.
          RELEASE_ID=$(curl -s https://api.github.com/repos/${GITHUB_REPOSITORY}/releases/tags/$RELEASE_TAG | jq .id)
          echo tag=$RELEASE_TAG has id=$RELEASE_ID
          # Upload every listed file as an asset; a failed upload does not fail the step.
          for RELEASE_FILE in $RELEASE_FILES; do
            curl -s -XPOST -T $RELEASE_FILE \
              -H "Authorization: token $GITHUB_TOKEN" \
              -H "Content-Type:application/octet-stream" \
              "https://uploads.github.com/repos/${GITHUB_REPOSITORY}/releases/$RELEASE_ID/assets?name=$RELEASE_FILE" || :
          done
23 changes: 17 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
Tiny HTML to JSON Converter
Convert any XML/HTML to JsonML

## BUILD

```sh
Expand Down Expand Up @@ -37,17 +38,27 @@ cat test/basic.html | ./html2json | jq .[1].lang
```jsonc
// doctype is omitted
["html",{"lang":"en"},[
["head",{},[
["head", {}, [
["meta", {"charset": "utf-8"} ],
["title", {}, "Basic Example" ],
["title", {}, ["Basic Example"] ],
["link", {"rel": "stylesheet"} ]
]],
["body", {"id": "home"}, [
["input", {"type": "text"}]
["p", {} ,"content"],
["input", {"type": "text"}],
["p", {}, ["content"]]
]]
]]
```

</td></tr>
</table>
</table>

# LIMITATIONS

parsing is done by [yxml](https://dev.yorhel.nl/yxml) with the following changes for HTML support:
- migrate `yxml_ret_t` to a bitfield enum so multiple states can be returned (example: parsing `>` in `<p hidden>` will return `ATTREND|ELEMSTART`)
- accept lowercase `<!doctype `
- accept unquoted attribute value `<form method=GET>`
- accept value-less attribute `<p hidden>`
- (HTML5 mode) treat encountered [void elements](https://developer.mozilla.org/en-US/docs/Glossary/Void_element) as self-closed
- (HTML5 mode) ignore end-tag of void elements (ex: `</img>`)
86 changes: 39 additions & 47 deletions html2json.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,68 +20,60 @@ int die(yxml_ret_t r, yxml_t* x)
fprintf(stderr, "error %i at line %u col %u %s", r, x->line, x->byte, reason);
return -r;
}
void putc_escape(int c)
void putc_escape(char* c)
{
if (c == '\\')
if (*c == '\\')
printf("\\\\");
else if (c == '\r')
else if (*c == '\r')
printf("\\r");
else if (c == '\n')
else if (*c == '\n')
printf("\\n");
else if (c == '\t')
else if (*c == '\t')
printf("\\t");
else
printf("%c", c);
printf("%s", c);
}
int main(int argc, char** argv)
{
yxml_t x;
yxml_ret_t r;
yxml_ret_t old_r, r = YXML_OK;
char buf[BUFSIZE];
setbuf(stdout, NULL);
yxml_init(&x, buf, BUFSIZE);
size_t need_close_attr = 0, need_close_tag = 0, depth = 0, last_elem_depth = -1, need_sep_attr = 0, element_has_text = 0;
size_t depth = 0, last_elem_depth = -1, element_has_text = 0;
for (int c; (c = getchar()) != EOF;) {
r = yxml_parse(&x, c);
if (0 && r)
printf("(%i)", r);
if (r < 0) {
return die(r, &x);
}
// pre-processing (autoclose,...)
if (need_close_attr && (r == YXML_ELEMSTART || r == YXML_CONTENT || r == YXML_ELEMEND)) {
printf("},[");
need_close_attr = 0;
yxml_ret_t tmp = yxml_parse(&x, c);
if (tmp < 0)
return die(tmp, &x);
if (tmp == YXML_OK)
continue;
element_has_text = tmp == YXML_CONTENT && ((c != ' ' && c != '\n' && c != '\r' && c != '\t') || element_has_text);
if (tmp == YXML_CONTENT && !element_has_text){
continue;
}
old_r = r;
r = tmp;

if (r == YXML_ELEMSTART) {
if (last_elem_depth == depth)
printf(",");
printf("[\"%s\",{", x.elem);
depth++;
need_close_attr = 1;
need_sep_attr = 0;
element_has_text = 0;
} else if (r == YXML_ATTRSTART) {
if (need_sep_attr)
printf(",");
printf("\"%s\":\"", x.attr);
need_sep_attr = 1;
} else if (r == YXML_ATTRVAL) {
putc_escape(c);
} else if (r == YXML_ATTREND) {
printf("\"");
} else if (r == YXML_CONTENT) {
int is_space = (c == ' ' || c == '\n' || c == '\r' || c == '\t');
if(!element_has_text && !is_space){printf("\"");}
element_has_text |= !is_space;
if(element_has_text)putc_escape(c);
} else if (r == YXML_ELEMEND) {
if(element_has_text){printf("\"");}
element_has_text=0;
printf("]]");
depth--;
last_elem_depth = depth;
}
// global test, put it first
if (old_r & YXML_CONTENT && !(r & YXML_CONTENT)) printf("\"");

if (r & YXML_ELEMSTART && old_r & YXML_ELEMSTART)printf(", {}, [");
if (r & YXML_ELEMSTART) printf("%s", (last_elem_depth == depth || old_r & YXML_CONTENT)?",":"");//0,1,2,4
if (r & YXML_ELEMSTART) printf("\n%*.s[\"%s\"", 2*(depth++),"",x.elem);
if (r & YXML_ATTRSTART && old_r & YXML_ELEMSTART)printf(", {");
if (r & YXML_ATTRSTART) printf("%s\"%s\":",(old_r & YXML_ATTREND?",":""),x.attr);
if (r & YXML_ATTRVAL && old_r & YXML_ATTRSTART)printf("\"");
if (r & YXML_ATTRVAL) putc_escape(x.data);
if (r & YXML_ATTREND) printf("\"");
if (r & YXML_CONTENT && old_r & YXML_ATTREND) printf("}, [");
if (r & YXML_CONTENT && old_r & YXML_ELEMSTART) printf(", {}, [");// insert empty object for easier JQ
if (r & YXML_CONTENT && !(old_r & YXML_CONTENT)) printf("%s\"",last_elem_depth == depth ? ",":"");//1,2,4,32
if (r & YXML_CONTENT) putc_escape(x.data);
if (r & YXML_ELEMEND && (old_r & YXML_ELEMSTART)) printf(", {}, []");// insert empty child for easeir jq
if (r & YXML_ELEMEND && (old_r & YXML_ATTREND)) printf("}, []");// insert empty child for easeir jq
if (r & YXML_ELEMEND && (old_r & YXML_CONTENT)) printf("]");
if (r & YXML_ELEMEND && (old_r & YXML_ELEMEND)) printf("]");
if (r & YXML_ELEMEND) printf("]",2*(last_elem_depth = --depth),"");
}
r = yxml_eof(&x);
if (r < 0)
Expand Down
Loading

0 comments on commit 4f09fad

Please sign in to comment.