htmlを読み込み構文解析するプログラムです。
結果をaccept!までもっていきたいのですが、BEGIN_BODY rejectで止まってしまいます。
html文は変えずにプログラムを変えて、実行結果がaccept!になるようにしたいです。
ちなみに、プログラムは下にあるBNFに従って書いてあります。
prog ::="<HTML>"body"</HTML>"
body ::="<BODY>"{line}"</BODY>"
line ::=table|文字列|"<HR>"|"<BR>"
table ::="<TABLE>"{tr}"</TABLE>"
tr ::="<TR>"{td}"</TR>"
td ::="<TD>"文字列"</TD>" | "<TD>"table"</TD>" | "<TD>""</TD>"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
/* Capacity of the input buffer, in bytes. */
#define BSIZE 4096
/* Input text; main() fills it from stdin before parsing starts. */
char buff[BSIZE];
/* Current read position inside buff; scan() advances it past each token. */
char *bp;
/*
 * Token kinds reported by scan(): one BEGIN_x / END_x pair per paired tag,
 * BR and HR for the two stand-alone tags, STRING for a run of text, and END
 * for the end of the input.
 */
enum tokentype
{
BEGIN_HTML, END_HTML, BEGIN_BODY, END_BODY,
BEGIN_TABLE, END_TABLE, BEGIN_TR, END_TR, BEGIN_TD, END_TD,
BR, HR, STRING, END
};
/*
 * Kind of the most recently scanned token; the parser functions read it as
 * their one-token lookahead. Being a file-scope object it holds BEGIN_HTML
 * (value 0) until scan() first assigns it.
 */
enum tokentype token;
/*
 * Forward declarations. They are required because the parser functions are
 * mutually recursive: td() calls table(), table() calls tr(), tr() calls td().
 */
void error(char *msg);
void scan();
void prog();
void body();
void line();
void table();
void tr();
void td();
/*
 * Report a parse error on stdout and terminate the program.
 *
 * msg: text to print; when NULL the generic text "error" is printed instead.
 *
 * Does not return. Terminating here guarantees that an input which has been
 * rejected can never go on to be reported as accepted, and that no parsing
 * loop keeps running on a token it cannot handle.
 */
void error(char *msg)
{
    puts(msg != NULL ? msg : "error");
    exit(EXIT_FAILURE);
}
void scan()
{
while (isspace(*bp)) {
bp++;
}
if (*bp == '\0') {
token = END;
return;
}
if (strncmp(bp, "<HTML>", 6) == 0) {
token = BEGIN_HTML;
bp += 6;
return;
}
if (strncmp(bp, "</HTML>", 7) == 0) {
token = END_HTML;
bp += 7;
return;
}
if (strncmp(bp, "<BODY>", 6) == 0) {
token = BEGIN_BODY;
bp += 6;
return;
}
if (strncmp(bp, "</BODY>", 7) == 0) {
token = END_BODY;
bp += 7;
return;
}
if (strncmp(bp, "<TABLE>", 7) == 0) {
token = BEGIN_TABLE;
bp += 7;
return;
}
if (strncmp(bp, "</TABLE>", 8) == 0) {
token = END_TABLE;
bp += 8;
return;
}
if (strncmp(bp, "<TR>", 4) == 0) {
token = BEGIN_TR;
bp += 4;
return;
}
if (strncmp(bp, "</TR>", 5) == 0) {
token = END_TR;
bp += 5;
return;
}
if (strncmp(bp, "<TD>", 4) == 0) {
token = BEGIN_TD;
bp += 4;
return;
}
if (strncmp(bp, "</TD>", 5) == 0) {
token = END_TD;
bp += 5;
return;
}
if (strncmp(bp, "<HR>", 4) == 0) {
token = HR;
bp += 4;
return;
}
if (strncmp(bp, "<BR>", 4) == 0) {
token = BR;
bp += 4;
return;
}
if(*bp=='<'){
while(*bp!='>'){
bp++;
}
return;
}
if (isprint(*bp) || isspace(*bp)) {
token = STRING;
bp++;
while ((*bp != '<') && (*bp != '>') && (*bp != '\0')) {
bp++;
}
return;
}
error("unknown token\n");
}
/*
 * prog ::= "<HTML>" body "</HTML>"
 *
 * Expects the lookahead token to be BEGIN_HTML on entry. Each mismatch is
 * reported through error(); the token is consumed either way.
 */
void prog()
{
    /* Opening tag. */
    if (!(token == BEGIN_HTML)) {
        error("BEGIN_HTML reject");
    }
    scan();

    body();

    /* Closing tag. */
    if (!(token == END_HTML)) {
        error("END_HTML reject");
    }
    scan();
}
void body()
{
if (token != BEGIN_BODY)
error("BEGIN_BODY reject");
scan();
while (token != END_BODY) {
line();
}
scan();
}
void line()
{
if(token==BEGIN_TABLE){
table();
}else if(token==STRING || token==HR || token==BR){
scan();
}
}
void table()
{
if(token!=BEGIN_TABLE)
error("BEGIN_TABLE reject");
scan();
while(token!=END_TABLE){
tr();
}
scan();
}
void tr()
{
if(token!=BEGIN_TR)
error("BEGIN_TR reject");
scan();
while(token!=END_TD){
td();
}
scan();
}
/*
 * td ::= "<TD>" string "</TD>" | "<TD>" table "</TD>" | "<TD>" "</TD>"
 *
 * Expects the lookahead token to be BEGIN_TD on entry. The cell content is
 * optional: a single string, a nested table, or nothing at all. Mismatched
 * opening or closing tags are reported through error(); the token is
 * consumed either way.
 */
void td()
{
    if (!(token == BEGIN_TD)) {
        error("BEGIN_TD reject");
    }
    scan();

    switch (token) {
    case STRING:
        scan();
        break;
    case BEGIN_TABLE:
        table();
        break;
    default:
        /* empty cell: nothing to consume */
        break;
    }

    if (!(token == END_TD)) {
        error("END_TD reject");
    }
    scan();
}
int main()
{
bp = buff;
while (fgets(bp, BSIZE, stdin) != NULL) {
bp += strlen(bp);
}
bp = buff;
scan();
prog();
if (token != END)
error("END reject!");
puts("accept!");
return 0;
}
<!-- Institute of Tech.
Test HTML for htmlchecker -->
<HTML>
<!-- unsupported tag
<HEAD>
<TITLE>
Test Page
</TITLE>
</HEAD>
-->
<BODY>
Sudacci Ponz!!
<HR>
Kana <BR>Institute<BR>of<BR>Technology
<HR>
<TABLE>
<TR><TD>PostScript</TD><TD>UNIX</TD><TD>Mr.President</TD></TR>
<TR><TD>Lenny</TD><TD>Squeeze</TD></TR>
<TR><TD></TD><TD>(*_*)</TD><TD></TD><TD>(^_^)</TD></TR>
</TABLE>
<HR>
<TABLE></TABLE>
GNU/Octave, OctaveForge
<!-- http://www.gnu.org/software/octave/ -->
<TABLE>
<TR>
<TD>Face</TD>
<TD><TABLE><TR><TD>(o_o)</TD></TR><TR><TD>(T_T)</TD></TR></TABLE></TD>
<TD><TABLE><TR><TD>(._.)</TD><TD>(?_?)</TD></TR></TABLE></TD>
</TR>
<TR>
<TD>H/W</TD><TD>S/W</TD><TD>Mr.President</TD>
</TR>
<TR>
<TD>My Favorite Things</TD>
</TR>
</TABLE>
</BODY>
</HTML>