435 lines
16 KiB
HTML
435 lines
16 KiB
HTML
|
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
<HTML><HEAD><TITLE>Man page of PCRE</TITLE>
|
|
</HEAD><BODY>
|
|
<H1>PCRE</H1>
|
|
Section: C Library Functions (3)<BR>Updated: 12 May 2013<BR><A HREF="#index">Index</A>
|
|
<A HREF="/cgi-bin/man/man2html">Return to Main Contents</A><HR>
|
|
|
|
<A NAME="lbAB"> </A>
|
|
<H2>NAME</H2>
|
|
|
|
PCRE - Perl-compatible regular expressions
|
|
<P>
|
|
<B>#include &lt;<A HREF="file:///usr/include/pcre.h">pcre.h</A>&gt;</B>
|
|
|
|
<A NAME="lbAC"> </A>
|
|
<H2>PCRE 32-BIT API BASIC FUNCTIONS</H2>
|
|
|
|
|
|
<P>
|
|
<PRE>
|
|
<B>pcre32 *pcre32_compile(PCRE_SPTR32 </B><I>pattern</I>, int <I>options</I>,
|
|
<B> const char **</B><I>errptr</I>, int *<I>erroffset</I>,
|
|
<B> const unsigned char *</B><I>tableptr</I>);
|
|
|
|
<B>pcre32 *pcre32_compile2(PCRE_SPTR32 </B><I>pattern</I>, int <I>options</I>,
|
|
<B> int *</B><I>errorcodeptr</I>,
|
|
<B> const unsigned char *</B><I>tableptr</I>);
|
|
|
|
<B>pcre32_extra *pcre32_study(const pcre32 *</B><I>code</I>, int <I>options</I>,
|
|
<B> const char **</B><I>errptr</I>);
|
|
|
|
<B>void pcre32_free_study(pcre32_extra *</B><I>extra</I>);
|
|
|
|
<B>int pcre32_exec(const pcre32 *</B><I>code</I>, const pcre32_extra *<I>extra</I>,
|
|
<B> PCRE_SPTR32 </B><I>subject</I>, int <I>length</I>, int <I>startoffset</I>,
|
|
<B> int </B><I>options</I>, int *<I>ovector</I>, int <I>ovecsize</I>);
|
|
|
|
<B>int pcre32_dfa_exec(const pcre32 *</B><I>code</I>, const pcre32_extra *<I>extra</I>,
|
|
<B> PCRE_SPTR32 </B><I>subject</I>, int <I>length</I>, int <I>startoffset</I>,
|
|
<B> int </B><I>options</I>, int *<I>ovector</I>, int <I>ovecsize</I>,
|
|
<B> int *</B><I>workspace</I>, int <I>wscount</I>);
|
|
</PRE>
|
|
|
|
<A NAME="lbAD"> </A>
|
|
<H2>PCRE 32-BIT API STRING EXTRACTION FUNCTIONS</H2>
|
|
|
|
|
|
<P>
|
|
<PRE>
|
|
<B>int pcre32_copy_named_substring(const pcre32 *</B><I>code</I>,
|
|
<B> PCRE_SPTR32 </B><I>subject</I>, int *<I>ovector</I>,
|
|
<B> int </B><I>stringcount</I>, PCRE_SPTR32 <I>stringname</I>,
|
|
<B> PCRE_UCHAR32 *</B><I>buffer</I>, int <I>buffersize</I>);
|
|
|
|
<B>int pcre32_copy_substring(PCRE_SPTR32 </B><I>subject</I>, int *<I>ovector</I>,
|
|
<B> int </B><I>stringcount</I>, int <I>stringnumber</I>, PCRE_UCHAR32 *<I>buffer</I>,
|
|
<B> int </B><I>buffersize</I>);
|
|
|
|
<B>int pcre32_get_named_substring(const pcre32 *</B><I>code</I>,
|
|
<B> PCRE_SPTR32 </B><I>subject</I>, int *<I>ovector</I>,
|
|
<B> int </B><I>stringcount</I>, PCRE_SPTR32 <I>stringname</I>,
|
|
<B> PCRE_SPTR32 *</B><I>stringptr</I>);
|
|
|
|
<B>int pcre32_get_stringnumber(const pcre32 *</B><I>code</I>,
|
|
<B> PCRE_SPTR32 </B><I>name</I>);
|
|
|
|
<B>int pcre32_get_stringtable_entries(const pcre32 *</B><I>code</I>,
|
|
<B> PCRE_SPTR32 </B><I>name</I>, PCRE_UCHAR32 **<I>first</I>, PCRE_UCHAR32 **<I>last</I>);
|
|
|
|
<B>int pcre32_get_substring(PCRE_SPTR32 </B><I>subject</I>, int *<I>ovector</I>,
|
|
<B> int </B><I>stringcount</I>, int <I>stringnumber</I>,
|
|
<B> PCRE_SPTR32 *</B><I>stringptr</I>);
|
|
|
|
<B>int pcre32_get_substring_list(PCRE_SPTR32 </B><I>subject</I>,
|
|
<B> int *</B><I>ovector</I>, int <I>stringcount</I>, PCRE_SPTR32 **<I>listptr</I>);
|
|
|
|
<B>void pcre32_free_substring(PCRE_SPTR32 </B><I>stringptr</I>);
|
|
|
|
<B>void pcre32_free_substring_list(PCRE_SPTR32 *</B><I>stringptr</I>);
|
|
</PRE>
|
|
|
|
<A NAME="lbAE"> </A>
|
|
<H2>PCRE 32-BIT API AUXILIARY FUNCTIONS</H2>
|
|
|
|
|
|
<P>
|
|
<PRE>
|
|
<B>pcre32_jit_stack *pcre32_jit_stack_alloc(int </B><I>startsize</I>, int <I>maxsize</I>);
|
|
|
|
<B>void pcre32_jit_stack_free(pcre32_jit_stack *</B><I>stack</I>);
|
|
|
|
<B>void pcre32_assign_jit_stack(pcre32_extra *</B><I>extra</I>,
|
|
<B> pcre32_jit_callback </B><I>callback</I>, void *<I>data</I>);
|
|
|
|
<B>const unsigned char *pcre32_maketables(void);</B>
|
|
|
|
<B>int pcre32_fullinfo(const pcre32 *</B><I>code</I>, const pcre32_extra *<I>extra</I>,
|
|
<B> int </B><I>what</I>, void *<I>where</I>);
|
|
|
|
<B>int pcre32_refcount(pcre32 *</B><I>code</I>, int <I>adjust</I>);
|
|
|
|
<B>int pcre32_config(int </B><I>what</I>, void *<I>where</I>);
|
|
|
|
<B>const char *pcre32_version(void);</B>
|
|
|
|
<B>int pcre32_pattern_to_host_byte_order(pcre32 *</B><I>code</I>,
|
|
<B> pcre32_extra *</B><I>extra</I>, const unsigned char *<I>tables</I>);
|
|
</PRE>
|
|
|
|
<A NAME="lbAF"> </A>
|
|
<H2>PCRE 32-BIT API INDIRECTED FUNCTIONS</H2>
|
|
|
|
|
|
<P>
|
|
<PRE>
|
|
<B>void *(*pcre32_malloc)(size_t);</B>
|
|
|
|
<B>void (*pcre32_free)(void *);</B>
|
|
|
|
<B>void *(*pcre32_stack_malloc)(size_t);</B>
|
|
|
|
<B>void (*pcre32_stack_free)(void *);</B>
|
|
|
|
<B>int (*pcre32_callout)(pcre32_callout_block *);</B>
|
|
</PRE>
|
|
|
|
<A NAME="lbAG"> </A>
|
|
<H2>PCRE 32-BIT API 32-BIT-ONLY FUNCTION</H2>
|
|
|
|
|
|
<P>
|
|
<PRE>
|
|
<B>int pcre32_utf32_to_host_byte_order(PCRE_UCHAR32 *</B><I>output</I>,
|
|
<B> PCRE_SPTR32 </B><I>input</I>, int <I>length</I>, int *<I>byte_order</I>,
|
|
<B> int </B><I>keep_boms</I>);
|
|
</PRE>
|
|
|
|
<A NAME="lbAH"> </A>
|
|
<H2>THE PCRE 32-BIT LIBRARY</H2>
|
|
|
|
|
|
<P>
|
|
Starting with release 8.32, it is possible to compile a PCRE library that
|
|
supports 32-bit character strings, including UTF-32 strings, as well as or
|
|
instead of the original 8-bit library. This work was done by Christian Persch,
|
|
based on the work done by Zoltan Herczeg for the 16-bit library. All three
|
|
libraries contain identical sets of functions, used in exactly the same way.
|
|
Only the names of the functions and the data types of their arguments and
|
|
results are different. To avoid over-complication and reduce the documentation
|
|
maintenance load, most of the PCRE documentation describes the 8-bit library,
|
|
with only occasional references to the 16-bit and 32-bit libraries. This page
|
|
describes what is different when you use the 32-bit library.
|
|
<P>
|
|
|
|
WARNING: A single application can be linked with all or any of the three
|
|
libraries, but you must take care when processing any particular pattern
|
|
to use functions from just one library. For example, if you want to study
|
|
a pattern that was compiled with <B>pcre32_compile()</B>, you must do so
|
|
with <B>pcre32_study()</B>, not <B>pcre_study()</B>, and you must free the
|
|
study data with <B>pcre32_free_study()</B>.
|
|
<A NAME="lbAI"> </A>
|
|
<H2>THE HEADER FILE</H2>
|
|
|
|
|
|
<P>
|
|
There is only one header file, <B>pcre.h</B>. It contains prototypes for all the
|
|
functions in all libraries, as well as definitions of flags, structures, error
|
|
codes, etc.
|
|
<A NAME="lbAJ"> </A>
|
|
<H2>THE LIBRARY NAME</H2>
|
|
|
|
|
|
<P>
|
|
In Unix-like systems, the 32-bit library is called <B>libpcre32</B>, and can
|
|
normally be accessed by adding <B>-lpcre32</B> to the command for linking an
|
|
application that uses PCRE.
|
|
<A NAME="lbAK"> </A>
|
|
<H2>STRING TYPES</H2>
|
|
|
|
|
|
<P>
|
|
In the 8-bit library, strings are passed to PCRE library functions as vectors
|
|
of bytes with the C type "char *". In the 32-bit library, strings are passed as
|
|
vectors of unsigned 32-bit quantities. The macro PCRE_UCHAR32 specifies an
|
|
appropriate data type, and PCRE_SPTR32 is defined as "const PCRE_UCHAR32 *". In
|
|
very many environments, "unsigned int" is a 32-bit data type. When PCRE is
|
|
built, it defines PCRE_UCHAR32 as "unsigned int", but checks that it really is
|
|
a 32-bit data type. If it is not, the build fails with an error message telling
|
|
the maintainer to modify the definition appropriately.
|
|
<A NAME="lbAL"> </A>
|
|
<H2>STRUCTURE TYPES</H2>
|
|
|
|
|
|
<P>
|
|
The types of the opaque structures that are used for compiled 32-bit patterns
|
|
and JIT stacks are <B>pcre32</B> and <B>pcre32_jit_stack</B> respectively. The
|
|
type of the user-accessible structure that is returned by <B>pcre32_study()</B>
|
|
is <B>pcre32_extra</B>, and the type of the structure that is used for passing
|
|
data to a callout function is <B>pcre32_callout_block</B>. These structures
|
|
contain the same fields, with the same names, as their 8-bit counterparts. The
|
|
only difference is that pointers to character strings are 32-bit instead of
|
|
8-bit types.
|
|
<A NAME="lbAM"> </A>
|
|
<H2>32-BIT FUNCTIONS</H2>
|
|
|
|
|
|
<P>
|
|
For every function in the 8-bit library there is a corresponding function in
|
|
the 32-bit library with a name that starts with <B>pcre32_</B> instead of
|
|
<B>pcre_</B>. The prototypes are listed above. In addition, there is one extra
|
|
function, <B>pcre32_utf32_to_host_byte_order()</B>. This is a utility function
|
|
that converts a UTF-32 character string to host byte order if necessary. The
|
|
other 32-bit functions expect the strings they are passed to be in host byte
|
|
order.
|
|
<P>
|
|
|
|
The <I>input</I> and <I>output</I> arguments of
|
|
<B>pcre32_utf32_to_host_byte_order()</B> may point to the same address, that is,
|
|
conversion in place is supported. The output buffer must be at least as long as
|
|
the input.
|
|
<P>
|
|
|
|
The <I>length</I> argument specifies the number of 32-bit data units in the
|
|
input string; a negative value specifies a zero-terminated string.
|
|
<P>
|
|
|
|
If <I>byte_order</I> is NULL, it is assumed that the string starts off in host
|
|
byte order. This may be changed by byte-order marks (BOMs) anywhere in the
|
|
string (commonly as the first character).
|
|
<P>
|
|
|
|
If <I>byte_order</I> is not NULL, a non-zero value of the integer to which it
|
|
points means that the input starts off in host byte order, otherwise the
|
|
opposite order is assumed. Again, BOMs in the string can change this. The final
|
|
byte order is passed back at the end of processing.
|
|
<P>
|
|
|
|
If <I>keep_boms</I> is not zero, byte-order mark characters (0xfeff) are copied
|
|
into the output string. Otherwise they are discarded.
|
|
<P>
|
|
|
|
The result of the function is the number of 32-bit units placed into the output
|
|
buffer, including the zero terminator if the string was zero-terminated.
|
|
<A NAME="lbAN"> </A>
|
|
<H2>SUBJECT STRING OFFSETS</H2>
|
|
|
|
|
|
<P>
|
|
The lengths and starting offsets of subject strings must be specified in 32-bit
|
|
data units, and the offsets within subject strings that are returned by the
|
|
matching functions are also in 32-bit units rather than bytes.
|
|
<A NAME="lbAO"> </A>
|
|
<H2>NAMED SUBPATTERNS</H2>
|
|
|
|
|
|
<P>
|
|
The name-to-number translation table that is maintained for named subpatterns
|
|
uses 32-bit characters. The <B>pcre32_get_stringtable_entries()</B> function
|
|
returns the length of each entry in the table as the number of 32-bit data
|
|
units.
|
|
<A NAME="lbAP"> </A>
|
|
<H2>OPTION NAMES</H2>
|
|
|
|
|
|
<P>
|
|
There are two new general option names, PCRE_UTF32 and PCRE_NO_UTF32_CHECK,
|
|
which correspond to PCRE_UTF8 and PCRE_NO_UTF8_CHECK in the 8-bit library. In
|
|
fact, these new options define the same bits in the options word. There is a
|
|
discussion about the
|
|
|
|
|
|
validity of UTF-32 strings
|
|
|
|
in the
|
|
|
|
<B>pcreunicode</B>
|
|
|
|
page.
|
|
<P>
|
|
|
|
For the <B>pcre32_config()</B> function there is an option PCRE_CONFIG_UTF32
|
|
that returns 1 if UTF-32 support is configured, otherwise 0. If this option is
|
|
given to <B>pcre_config()</B> or <B>pcre16_config()</B>, or if the
|
|
PCRE_CONFIG_UTF8 or PCRE_CONFIG_UTF16 option is given to <B>pcre32_config()</B>,
|
|
the result is the PCRE_ERROR_BADOPTION error.
|
|
<A NAME="lbAQ"> </A>
|
|
<H2>CHARACTER CODES</H2>
|
|
|
|
|
|
<P>
|
|
In 32-bit mode, when PCRE_UTF32 is not set, character values are treated in the
|
|
same way as in 8-bit, non UTF-8 mode, except, of course, that they can range
|
|
from 0 to 0x7fffffff instead of 0 to 0xff. Character types for characters less
|
|
than 0xff can therefore be influenced by the locale in the same way as before.
|
|
Characters greater than 0xff have only one case, and no "type" (such as letter
|
|
or digit).
|
|
<P>
|
|
|
|
In UTF-32 mode, the character code is Unicode, in the range 0 to 0x10ffff, with
|
|
the exception of values in the range 0xd800 to 0xdfff because those are
|
|
"surrogate" values that are ill-formed in UTF-32.
|
|
<P>
|
|
|
|
A UTF-32 string can indicate its endianness by a special code known as a
|
|
byte-order mark (BOM). The PCRE functions do not handle this, expecting strings
|
|
to be in host byte order. A utility function called
|
|
<B>pcre32_utf32_to_host_byte_order()</B> is provided to help with this (see
|
|
above).
|
|
<A NAME="lbAR"> </A>
|
|
<H2>ERROR NAMES</H2>
|
|
|
|
|
|
<P>
|
|
The error PCRE_ERROR_BADUTF32 corresponds to its 8-bit counterpart.
|
|
The error PCRE_ERROR_BADMODE is given when a compiled
|
|
pattern is passed to a function that processes patterns in the other
|
|
mode, for example, if a pattern compiled with <B>pcre_compile()</B> is passed to
|
|
<B>pcre32_exec()</B>.
|
|
<P>
|
|
|
|
There are new error codes whose names begin with PCRE_UTF32_ERR for invalid
|
|
UTF-32 strings, corresponding to the PCRE_UTF8_ERR codes for UTF-8 strings that
|
|
are described in the section entitled
|
|
|
|
|
|
"Reason codes for invalid UTF-8 strings"
|
|
|
|
in the main
|
|
|
|
<B>pcreapi</B>
|
|
|
|
page. The UTF-32 errors are:
|
|
<P>
|
|
<BR> PCRE_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff)
|
|
<BR> PCRE_UTF32_ERR2 Non-character
|
|
<BR> PCRE_UTF32_ERR3 Character &gt; 0x10ffff
|
|
<A NAME="lbAS"> </A>
|
|
<H2>ERROR TEXTS</H2>
|
|
|
|
|
|
<P>
|
|
If there is an error while compiling a pattern, the error text that is passed
|
|
back by <B>pcre32_compile()</B> or <B>pcre32_compile2()</B> is still an 8-bit
|
|
character string, zero-terminated.
|
|
<A NAME="lbAT"> </A>
|
|
<H2>CALLOUTS</H2>
|
|
|
|
|
|
<P>
|
|
The <I>subject</I> and <I>mark</I> fields in the callout block that is passed to
|
|
a callout function point to 32-bit vectors.
|
|
<A NAME="lbAU"> </A>
|
|
<H2>TESTING</H2>
|
|
|
|
|
|
<P>
|
|
The <B>pcretest</B> program continues to operate with 8-bit input and output
|
|
files, but it can be used for testing the 32-bit library. If it is run with the
|
|
command line option <B>-32</B>, patterns and subject strings are converted from
|
|
8-bit to 32-bit before being passed to PCRE, and the 32-bit library functions
|
|
are used instead of the 8-bit ones. Returned 32-bit strings are converted to
|
|
8-bit for output. If neither the 8-bit nor the 16-bit library was compiled,
|
|
<B>pcretest</B> defaults to 32-bit and the <B>-32</B> option is ignored.
|
|
<P>
|
|
|
|
When PCRE is being built, the <B>RunTest</B> script that is called by "make
|
|
check" uses the <B>pcretest</B> <B>-C</B> option to discover which of the 8-bit,
|
|
16-bit and 32-bit libraries has been built, and runs the tests appropriately.
|
|
<A NAME="lbAV"> </A>
|
|
<H2>NOT SUPPORTED IN 32-BIT MODE</H2>
|
|
|
|
|
|
<P>
|
|
Not all the features of the 8-bit library are available with the 32-bit
|
|
library. The C++ and POSIX wrapper functions support only the 8-bit library,
|
|
and the <B>pcregrep</B> program is at present 8-bit only.
|
|
<A NAME="lbAW"> </A>
|
|
<H2>AUTHOR</H2>
|
|
|
|
|
|
<P>
|
|
<PRE>
|
|
Philip Hazel
|
|
University Computing Service
|
|
Cambridge CB2 3QH, England.
|
|
</PRE>
|
|
|
|
<A NAME="lbAX"> </A>
|
|
<H2>REVISION</H2>
|
|
|
|
|
|
<P>
|
|
<PRE>
|
|
Last updated: 12 May 2013
|
|
Copyright (c) 1997-2013 University of Cambridge.
|
|
</PRE>
|
|
|
|
<P>
|
|
|
|
<HR>
|
|
<A NAME="index"> </A><H2>Index</H2>
|
|
<DL>
|
|
<DT id="1"><A HREF="#lbAB">NAME</A><DD>
|
|
<DT id="2"><A HREF="#lbAC">PCRE 32-BIT API BASIC FUNCTIONS</A><DD>
|
|
<DT id="3"><A HREF="#lbAD">PCRE 32-BIT API STRING EXTRACTION FUNCTIONS</A><DD>
|
|
<DT id="4"><A HREF="#lbAE">PCRE 32-BIT API AUXILIARY FUNCTIONS</A><DD>
|
|
<DT id="5"><A HREF="#lbAF">PCRE 32-BIT API INDIRECTED FUNCTIONS</A><DD>
|
|
<DT id="6"><A HREF="#lbAG">PCRE 32-BIT API 32-BIT-ONLY FUNCTION</A><DD>
|
|
<DT id="7"><A HREF="#lbAH">THE PCRE 32-BIT LIBRARY</A><DD>
|
|
<DT id="8"><A HREF="#lbAI">THE HEADER FILE</A><DD>
|
|
<DT id="9"><A HREF="#lbAJ">THE LIBRARY NAME</A><DD>
|
|
<DT id="10"><A HREF="#lbAK">STRING TYPES</A><DD>
|
|
<DT id="11"><A HREF="#lbAL">STRUCTURE TYPES</A><DD>
|
|
<DT id="12"><A HREF="#lbAM">32-BIT FUNCTIONS</A><DD>
|
|
<DT id="13"><A HREF="#lbAN">SUBJECT STRING OFFSETS</A><DD>
|
|
<DT id="14"><A HREF="#lbAO">NAMED SUBPATTERNS</A><DD>
|
|
<DT id="15"><A HREF="#lbAP">OPTION NAMES</A><DD>
|
|
<DT id="16"><A HREF="#lbAQ">CHARACTER CODES</A><DD>
|
|
<DT id="17"><A HREF="#lbAR">ERROR NAMES</A><DD>
|
|
<DT id="18"><A HREF="#lbAS">ERROR TEXTS</A><DD>
|
|
<DT id="19"><A HREF="#lbAT">CALLOUTS</A><DD>
|
|
<DT id="20"><A HREF="#lbAU">TESTING</A><DD>
|
|
<DT id="21"><A HREF="#lbAV">NOT SUPPORTED IN 32-BIT MODE</A><DD>
|
|
<DT id="22"><A HREF="#lbAW">AUTHOR</A><DD>
|
|
<DT id="23"><A HREF="#lbAX">REVISION</A><DD>
|
|
</DL>
|
|
<HR>
|
|
This document was created by
|
|
<A HREF="/cgi-bin/man/man2html">man2html</A>,
|
|
using the manual pages.<BR>
|
|
Time: 00:05:51 GMT, March 31, 2021
|
|
</BODY>
|
|
</HTML>
|