紫龙书3.4.5之KMP算法fail函数

我读的紫龙书是机械工业出版社2009.01之第2版。ISBN:978-7-111-25121-7。译者是南京大学的赵建华,邓滔,戴新宇。

该书74页有个错误:

3)串s的子串(substring)是删除s的某个前缀和某个后缀之后得到的串。例如,bnana,nan和e是bnana的子串。

第一感觉这里有问题。翻开英文原书,这样写的:

3. A substring of s is obtained by deleting any prefix and any suffix from s. For instance, banana, nan, and E are substrings of banana.

就应该只是删除子串嘛,不应该是删除0个或多个符号,成为子序列那样。本书的编辑真该打屁股。

另,3.4.5,

该函数的目标是使得b1b2...bf(s)是最长的既是b1b2...bn的真前缀又是b1b2...bs的后缀的子串。

对应英文是:

The objective is that blb2...bf(s) is the longest proper prefix of b1b2...bs, that is also a suffix of b1b2bs.

读过原文之后,才勉强可理解中文翻译。中文之所以费解,是由于将“最长的”三字依英文位置给前置了。私以为放在后面会好理解一些,如:

该函数的目标是使得b1b2...bf(s)既是b1b2...bn的真前缀又是b1b2...bs的后缀的最长的子串。

就此打住,切入正题。说一下失效函数。

原书中,图3-19给出的fail函数,我使用c语言做出来了。要点是原图中的下标是以1-based,翻译为c语言要调整为0-based。程序如下:

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#include<stdio.h>
#include<string.h>
 
int fail(const char *b)
{
    int n=strlen(b);
    int t=0;
    int f[n+1];
    f[1]=0;
    int s;
    for(s=1;s<=n;s++)
    {
        while(t>0 && b[s]!= b[t])
        {
            t=f[t];
        }
        if (b[s]==b[t])
        {
            t++;
            f[s+1]=t;
        }
        else
        {
            f[s+1]=0;
        }
    }
    for (s=1;s<=n;s++)
    {
        printf("%d ",f[s]);
    }
    printf("\n");
}
 
main()
{
   fail("abababaab");
     
}

运行该程序,解得3.4.3各串的失效函数分别为:

  1. abababaab->0 0 1 2 3 4 5 1 2
  2. aaaaaa->0 1 2 3 4 5
  3. abbaabb->0 0 0 1 1 2 3

 

紫龙书2.6习题

  1. 可忽略如下格式的注释//、#、/*...*/。但是如果在字串中,则无法忽略。如("../*..*/ # //")。
  2. 可识别<,<=,==,!=,>=,>。
  3. 可识别浮点数,格式如:9.22323,.323,22323.33,等等。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#!/bin/env python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
#rex
#2009.08.22.
 
#不使用正则表达式,自定义下面三个函数,分别判断是否数字/字母,或者二者之一。
def isDigit(d):
    if d>=0 and d<=9 and type(d)==int or d>='0' and d<='9' and type(d)==str:
        return 1
    else: return 0
 
def isLetter(l):
    if l>='a' and l<='z' and type(l)==str or l>='A' and l<='Z' and type(l)==str:
        return 1
    else: return 0
 
def isLetterOrDigit(x):
    if isDigit(x) or isLetter(x):
        return 1
    else: return 0
 
 
 
 
#比较关系
def isComparison(x):
    if x=='<' or x=='>' or x=='!':
        return 1
    else:
        return 0
 
#本函数接收一个字串,
#如果当前是#或//,则返回下一行的内容。
 
def parse_float_num(str):
    index=0
    ch=str[index]
    #print ch,str[index+1:]
    value=0
    weight=10
    while(isDigit(ch) or ch=='.'):       
        if ch=='.':
            weight=1
            index+=1
            if index>len(str)-1:break
            ch=str[index]
        elif isDigit(ch):
            if weight==10:
                value=weight*value+int(ch)
            elif weight<10:
                weight*=0.1
                value=value+weight*int(ch)
            index+=1
            if index>len(str)-1:break
            ch=str[index]
        else: break
         
    return value
 
def new_line(str):
    x=0
    c=str[x]  
    if c=='#':
        #print c, str[x+1]
        while(c!='\n'):
            x+=1
            c=str[x]
        str=str[x+1:]
        return new_line(str)
    elif c=='/' and str[1]=='/':
        while(c!='\n'):
            x+=1
            c=str[x]
        str=str[x+1:]
        return new_line(str)
    elif c=='/' and str[1]=='*':
        x=2
        c=str[x]
        while c:           
            if c=='*' and str[x+1]=='/':    
                return new_line(str[x+2:])           
            x+=1
            c=str[x]
    elif c==' ' or c=='\t' or c=='\n':
        x=0
        while c==' ' or c=='\t' or c=='\n':           
            x+=1
            c=str[x]
        #print "here c:",c
        if str[x+1]:
            return new_line(str[x:])
        else:
            return ""
    else:
        return str
         
 
#定义基类token,以便被继承
class Token:
    tag=0
    def __init__(self,t):
        self.tag=t
    def tokenprint(self):
        print "%s"%self.tag
 
class Tag:
    NUM=256
    ID=257
    TRUE=258
    FALSE=259
    COMPARE=260
 
#继承token,定义num类.
class Num(Token):
    value=0
    def __init__(self,v):
        #注意,此处override了token的自定义.
        Token.__init__(self,Tag.NUM)
        self.value=v
 
    def tokenprint(self):
        #自己的输出函数.
        print self.tag,self.value
         
         
 
#同上.
class Word(Token):
    lexeme=""
    def __init__(self,t,s):
        Token.__init__(self,t)
        self.lexeme=s
    def tokenprint(self):
        print ("%d,%s")%(self.tag,self.lexeme)
 
#同上
class Lexer():
    line=1
    peek=' '
    words={}
    def printall(self):
        #打印所有的保留字、标识符。
        if len(self.words):
            for x in self.words.keys():
                m=self.words[x].tokenprint()
                #忽略其中None的内容。
                if m: print m
 
    def reserve(self,word):
        #之所以选python重写,是因为python里的dictionary与java程序中的Hashtable()是一家.
        self.words[word.lexeme]=word
 
    def __init__(self):
        self.reserve(Word(Tag.TRUE,"true"))
        self.reserve(Word(Tag.FALSE,"false"))
        self.reserve(Word(Tag.COMPARE,"=="))
        self.reserve(Word(Tag.COMPARE,">="))
        self.reserve(Word(Tag.COMPARE,"<="))
        self.reserve(Word(Tag.COMPARE,"!="))
        self.reserve(Word(Tag.COMPARE,">"))
        self.reserve(Word(Tag.COMPARE,"<"))
    def scan(self,str_line):
        buf=str_line
        index=0
         
        char=buf[index]
        while(char):
            if char==' ' or char=='\t':
                index+=1
                char=buf[index]               
            elif char=='#' or char=='/' and buf[index+1]=='/' or char=='/' and buf[index+1]=='*':               
                buf=new_line(buf[index:])
                index=0
                char=buf[index]               
            elif char=='\n':
                self.line+=self.line
                index+=1
                char=buf[index]
            else:    break
             
 
        #python的[]好好用呀!
        buf=buf[index:]
        if isDigit(buf[0]) or buf[0]=='.':           
            #for 也好用.
            v=parse_float_num(buf)
            v=Num(v)
            v.tokenprint()
            return  v
        if isLetter(buf[0]):
            b=""
            for x in buf:
                if isLetterOrDigit(x):
                    b+=x
                else:break
 
            #高级语言的特性
            if b in self.words.keys():
                return self.words[b]
            else:
                w=Word(Tag.ID,b)
                self.words[b]=w
                return w
        if isComparison(buf[0]):
            v=""
            #for 也好用.
            for x in buf:
                if isComparison(x):
                    v+=x
                else:break
            if v in self.words.keys():
                return self.words[v]
            else:
                w=Word(Tag.COMPARE,v)
                self.words[v]=w
                return w          
             
        t=Token(buf[0])
        return t
 
 
 
#测试  .忽略行首连续空白字符;
 
x=Lexer()
 
x.scan("    true")
x.scan("    <>")
x.scan("    /**/<")
x.scan('''    /*
 
*/
 
.123445
''')
x.printall()

 

 

中缀表达式转后缀表达式

最近在看龙书第1版。第二版的例子有的是以java写的。个人喜欢第一版的c程序。

看到第2.6,上面提供了一个中缀转后缀的C语言程序。不过只支持个位数,加减运算。我修改一下,使之增加对多位正整数、乘、除、小括号的支持。允许数字与操作符之间出现任意空白字​符。

理论依据很清晰,就不再写额外的注释了。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#include<stdio.h>
#include<ctype.h>
 
/*
中缀表达式
 
理论依据:
expr -> expr + term | expr - term | term
term -> term * factor | term / factor | factor
factor -> digit | ( expr )
 
*/
 
int la; //lookahead
 
 
main()
{
    la=getchar();       
    expr();
    putchar('\n');
}
 
is_space(c)
    int c;
{
    if (c==' ' || c=='\t')
    {
        return 1;
    }
    else return 0;
}
 
space()
{
    while (la==' ' || la=='\t')
    {
        la=getchar();
    }
}
 
expr()
{
    term();
    while(1)
    {
        if (la=='+')
        {
            putchar(' ');
            match('+');term();putchar(' ');putchar('+');
        }
        else if (la=='-')
        {
            putchar(' ');           
            match('-');term();putchar(' ');putchar('-');
        }
        /*
        else if (is_space(la))
        {
            space();
        }
        */
        else break;
    }
}
 
term()
{
    factor();
    while(1)
    {
        if (la=='*')
        {
            putchar(' ');
            match('*');factor();putchar(' ');putchar('*');
        }
        else if (la=='/')
        {
            putchar(' ');
            match('/');factor();putchar(' ');putchar('/');
        }
        /*
        else if (is_space(la))
        {
            space();
        }
        */
        else break;
    }
}
 
factor()
{
    space();
    if (isdigit(la))
    {
        while (isdigit(la))
        {
            putchar(la);
            match(la);
        }   
    }
    else if (la=='(')
    {
        while(la!=')')
        {
            la=getchar();
            expr();
        }
        la=getchar();       
    }
    space();
}
 
match(t)
int t;
{
    if (la==t)
    {
        la=getchar();
    }
    else error();
}
 
error()
{
    printf("syntax error\n");
}