Reputation: 110550
I am developing a c/c++ program on linux. Can you please tell me if there is any c/c++ library which decodes url?
I am looking for libraries which convert "http%3A%2F%2F" to: "http://"
or "a+t+%26+t" to "a t & t"
Thank you.
Upvotes: 15
Views: 43334
Reputation: 51
Thanks to @ThomasH for his answer. I'd like to propose a better-formatted version here…
And… since the decoded URI component is never longer than the encoded one, it is always possible to decode it in place, within the same array of characters (a.k.a. "string"). So I'll propose two possibilities here:
#include <stdio.h>
#include <ctype.h>
#include <limits.h>
/* Value (0-15) of a hexadecimal digit; the caller must have checked isxdigit(). */
static int uriHexDigitValue (unsigned char cDigit) {
    if (cDigit <= '9') {
        return cDigit - '0';
    }
    if (cDigit <= 'F') {
        return cDigit - 'A' + 10;
    }
    return cDigit - 'a' + 10;
}

/**
 * Decodes a percent-encoded URI component.
 *
 * Every "%XY" sequence whose X and Y are both hexadecimal digits is replaced
 * by the byte with that value. Every other character, including '+' and a
 * '%' that does not start a valid escape, is copied unchanged.
 *
 * The source string is only read, never written through sSource.
 *
 * @param sSource NUL-terminated encoded string.
 * @param sDest   Output buffer with room for strlen(sSource) + 1 bytes.
 *                It may be the same pointer as sSource (in-place decoding):
 *                the write position never overtakes the read position.
 * @return Number of bytes stored in sDest, not counting the terminating NUL.
 */
int decodeURIComponent (char *sSource, char *sDest) {
    int nLength;

    for (nLength = 0; *sSource; nLength++) {
        /* isxdigit('\0') is false, so the short-circuit evaluation never
         * reads past the terminator. The casts keep bytes >= 0x80 from being
         * passed to isxdigit() as negative values. */
        if (*sSource == '%'
            && isxdigit((unsigned char) sSource[1])
            && isxdigit((unsigned char) sSource[2])) {
            sDest[nLength] = (char) (16 * uriHexDigitValue((unsigned char) sSource[1])
                                     + uriHexDigitValue((unsigned char) sSource[2]));
            sSource += 3;
        } else {
            sDest[nLength] = *sSource++;
        }
    }
    sDest[nLength] = '\0';
    return nLength;
}
/* Decodes url in place by passing the same buffer as both source and
 * destination. The macro argument is expanded twice, so it should not have
 * side effects. */
#define implodeURIComponent(url) decodeURIComponent(url, url)
And, finally…:
/* Demo: decode a sample URL in place, then print it together with its new length. */
int main () {
    char sEncodedUrl[] = "http%3a%2F%2ffoo+bar%2fabcd";
    int nDecodedLength;

    nDecodedLength = implodeURIComponent(sEncodedUrl);

    /* Expected output: "http://foo+bar/abcd\nLength: 19" */
    printf("%s\nLength: %d\n", sEncodedUrl, nDecodedLength);

    return 0;
}
Ste*
Upvotes: 2
Reputation: 22134
Here is a C decoder for a percent encoded string. It returns -1 if the encoding is invalid and 0 otherwise. The decoded string is stored in out. I'm quite sure this is the fastest code of the answers given so far.
/**
 * Decodes a percent-encoded string.
 *
 * Each "%XY" escape (X and Y hexadecimal digits, either case) is replaced by
 * the byte it encodes; every other character is copied unchanged ('+' is not
 * translated to a space).
 *
 * @param out Destination buffer; needs room for strlen(in) + 1 bytes.
 * @param in  NUL-terminated encoded string. If NULL, out receives an empty
 *            string and 0 is returned.
 * @return 0 on success, -1 if a '%' is not followed by two hexadecimal
 *         digits. On failure out is reset to the empty string.
 */
int percent_decode(char* out, const char* in)
{
    /* Value of each byte as a hex digit, or -1 if it is not one. The element
     * type is explicitly signed so the -1 sentinel and the "< 0" checks below
     * also work where plain char is unsigned. */
    static const signed char tbl[256] = {
        -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
        -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
        -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
         0, 1, 2, 3, 4, 5, 6, 7,  8, 9,-1,-1,-1,-1,-1,-1,
        -1,10,11,12,13,14,15,-1, -1,-1,-1,-1,-1,-1,-1,-1,
        -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
        -1,10,11,12,13,14,15,-1, -1,-1,-1,-1,-1,-1,-1,-1,
        -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
        -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
        -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
        -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
        -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
        -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
        -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
        -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
        -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1
    };
    char c, *beg = out;
    signed char v1, v2;

    if (in != NULL) {
        while ((c = *in++) != '\0') {
            if (c == '%') {
                /* tbl['\0'] is -1, so a truncated escape at the end of the
                 * string fails here before anything past the terminator is
                 * read. */
                if ((v1 = tbl[(unsigned char)*in++]) < 0 ||
                    (v2 = tbl[(unsigned char)*in++]) < 0) {
                    *beg = '\0';
                    return -1;
                }
                c = (char)((v1 << 4) | v2);
            }
            *out++ = c;
        }
    }
    *out = '\0';
    return 0;
}
Upvotes: 10
Reputation: 11
Came across this 8 year old question as I was looking for the same. Based on previous answers, I also wrote my own version which is independent from libs, easy to understand and probably fast (no benchmark). Tested code with gcc, it should decode until end or invalid character (not tested). Just remember to free allocated space.
/* Maps (hex digit character - '0') to its 4-bit value. Indices 0..9 cover
 * '0'..'9' and 17..22 cover 'A'..'F'; indices 10..16 (':'..'@') are filler. */
const char ascii_hex_4bit[23] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0, 10, 11, 12, 13, 14, 15};

/* Uppercases 'a'..'z' by flipping bit 0x20; other characters pass through. */
static inline char to_upper(char c)
{
    if ((c >= 'a') && (c <= 'z')) return c ^ 0x20;
    return c;
}

/**
 * Decodes a URL-encoded string into a newly allocated buffer.
 *
 * "%XY" escapes are converted to the byte they encode, '+' becomes a space,
 * and the characters '*', '-', '.', '_', digits and ASCII letters are copied
 * unchanged. Decoding stops at the first character outside that set, at the
 * first malformed escape, or at an escape that decodes to NUL; what was
 * decoded up to that point is returned.
 *
 * @param str NUL-terminated encoded string (NULL is tolerated).
 * @return A malloc'ed NUL-terminated string the caller must free(), or NULL
 *         if str is NULL or the allocation fails.
 */
char *url_decode(const char *str)
{
    size_t i, j, len;
    char c, d, url_hex;
    char *decoded;

    if (str == NULL) return NULL;

    len = strlen(str);
    /* Each output byte consumes at least one input byte, so len + 1 bytes
     * always suffice. */
    decoded = malloc(len + 1);
    if (decoded == NULL) return NULL;

    i = 0;
    j = 0;
    /* A pre-checked loop: for an empty input nothing but the terminator is
     * written, which keeps the write inside the 1-byte allocation. */
    while (i < len)
    {
        c = str[i];
        d = 0;
        if (c == '%')
        {
            /* Reading str[i + 1] / str[i + 2] is safe: at worst it hits the
             * terminating NUL, which is not a hex digit. */
            url_hex = to_upper(str[++i]);
            if (((url_hex >= '0') && (url_hex <= '9')) || ((url_hex >= 'A') && (url_hex <= 'F')))
            {
                d = ascii_hex_4bit[url_hex - 48] << 4;
                url_hex = to_upper(str[++i]);
                if (((url_hex >= '0') && (url_hex <= '9')) || ((url_hex >= 'A') && (url_hex <= 'F')))
                {
                    d |= ascii_hex_4bit[url_hex - 48];
                }
                else
                {
                    d = 0;
                }
            }
        }
        else if (c == '+')
        {
            d = ' ';
        }
        else if ((c == '*') || (c == '-') || (c == '.') || ((c >= '0') && (c <= '9')) ||
                 ((c >= 'A') && (c <= 'Z')) || (c == '_') || ((c >= 'a') && (c <= 'z')))
        {
            d = c;
        }

        /* d == 0 marks an invalid character or escape: stop decoding. */
        if (d == 0) break;

        decoded[j++] = d;
        ++i;
    }
    decoded[j] = 0;
    return decoded;
}
Upvotes: 1
Reputation: 399813
The ever-excellent glib has some URI functions, including scheme-extraction, escaping and un-escaping.
Upvotes: 3
Reputation: 1165
I actually used Saul's function in an analysis program I was writing (analyzing millions of URL encoded strings), and while it works, at that scale it was slowing my program down horribly, so I decided to write a faster version. This one is thousands of times faster when compiled with GCC and the -O2 option. It can also use the same output buffer as the input (e.g. urldecode2(buf, buf) will work if the original string was in buf and is to be overwritten by its decoded counterpart).
Edit: It doesn't take the buffer size as an input because it is assumed that the buffer will be large enough, this is safe because it is known that the length of the output will always be <= that of the input, so either use the same buffer for the output or create one that's at least the size of the input + 1 for the null terminator, e.g.:
/* The output buffer gets strlen(input) + 1 bytes: one per input character
 * plus one for the NUL terminator. */
/* NOTE(review): the malloc result is not checked in this example. */
char *output = malloc(strlen(input)+1);
urldecode2(output, input);
printf("Decoded string: %s\n", output);
Edit 2: An anonymous user attempted to edit this answer to handle the '+' character's translation to ' ', which I think it should probably do, again this wasn't something that I needed for my application, but I've added it below.
Here's the routine:
#include <stdlib.h>
#include <ctype.h>
/**
 * Decodes a URL-encoded string.
 *
 * "%XY" escapes with two hexadecimal digits become the byte they encode,
 * '+' becomes a space, and everything else (including a '%' that does not
 * start a valid escape) is copied unchanged.
 *
 * @param dst Output buffer with room for strlen(src) + 1 bytes. It may be
 *            the same buffer as src: the write position never passes the
 *            read position.
 * @param src NUL-terminated encoded string.
 */
void urldecode2(char *dst, const char *src)
{
    char a, b;

    while (*src) {
        /* The assignments stop at the terminator, so src[2] is only read
         * when src[1] is not NUL. The unsigned char casts keep bytes >= 0x80
         * from reaching isxdigit() as negative values. */
        if ((*src == '%') &&
            ((a = src[1]) && (b = src[2])) &&
            (isxdigit((unsigned char)a) && isxdigit((unsigned char)b))) {
            /* Fold lowercase to uppercase, then map '0'-'9' / 'A'-'F' to 0-15. */
            if (a >= 'a')
                a -= 'a'-'A';
            if (a >= 'A')
                a -= ('A' - 10);
            else
                a -= '0';
            if (b >= 'a')
                b -= 'a'-'A';
            if (b >= 'A')
                b -= ('A' - 10);
            else
                b -= '0';
            *dst++ = 16*a+b;
            src+=3;
        } else if (*src == '+') {
            *dst++ = ' ';
            src++;
        } else {
            *dst++ = *src++;
        }
    }
    *dst = '\0';
}
Upvotes: 34
Reputation: 5088
I'd suggest curl and libcurl. It's widely used and should do the trick for you. Just check their website.
Upvotes: 2
Reputation: 5787
Try urlcpp https://github.com/larroy/urlcpp It's a C++ module that you can easily integrate in your project, depends on boost::regex
Upvotes: 1
Reputation: 61
/**
 * Convert an ASCII uppercase letter to lowercase without consulting the
 * locale. Any value outside 'A'..'Z' is returned unchanged.
 */
int av_tolower(int c)
{
    const int is_ascii_upper = (c >= 'A') && (c <= 'Z');

    /* Upper- and lowercase ASCII letters differ only in bit 0x20. */
    return is_ascii_upper ? (c ^ 0x20) : c;
}
/**
 * Decodes an URL from its percent-encoded form back into normal
 * representation. This function returns the decoded URL in a string.
 * The URL to be decoded does not necessarily have to be encoded but
 * in that case the original string is duplicated.
 *
 * "%XY" with two hexadecimal digits becomes the encoded byte and '+'
 * becomes a space. A '%' followed by two characters that are not both
 * hexadecimal digits is copied through together with those two characters;
 * a '%' with fewer than two characters after it is copied as is.
 *
 * @param url a string to be decoded.
 * @return new string with the URL decoded, or NULL if url is NULL or the
 * allocation failed.
 * Note that the returned string should be explicitly freed when not
 * used anymore.
 */
char *urldecode(const char *url)
{
    int s = 0, d = 0, url_len = 0;
    char c;
    char *dest = NULL;

    if (!url)
        return NULL;

    /* url_len counts the terminating NUL, so the loop below copies it too
     * and dest ends up NUL-terminated. */
    url_len = strlen(url) + 1;
    dest = av_malloc(url_len);

    if (!dest)
        return NULL;

    while (s < url_len) {
        c = url[s++];

        /* s + 2 < url_len: two more characters exist before the NUL. */
        if (c == '%' && s + 2 < url_len) {
            char c2 = url[s++];
            char c3 = url[s++];
            /* Cast to unsigned char: passing a negative char value to
             * isxdigit() is undefined behavior. */
            if (isxdigit((unsigned char)c2) && isxdigit((unsigned char)c3)) {
                c2 = av_tolower(c2);
                c3 = av_tolower(c3);

                if (c2 <= '9')
                    c2 = c2 - '0';
                else
                    c2 = c2 - 'a' + 10;

                if (c3 <= '9')
                    c3 = c3 - '0';
                else
                    c3 = c3 - 'a' + 10;

                dest[d++] = 16 * c2 + c3;

            } else { /* %zz or something other invalid */
                dest[d++] = c;
                dest[d++] = c2;
                dest[d++] = c3;
            }
        } else if (c == '+') {
            dest[d++] = ' ';
        } else {
            dest[d++] = c;
        }
    }

    return dest;
}
by
www.elesos.com
Upvotes: 0
Reputation: 1002
This function I've just whipped up is very lightweight and should do as you wish, note I haven't programmed this to strict URI standards (used what I know off the top of my head). It's buffer-safe and doesn't overflow as far as I can see; adapt as you deem fit:
#include <assert.h>
/* Value (0-15) of a hexadecimal digit; the caller must have checked isxdigit(). */
static int urldecode_hex_value(unsigned char c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    return c - 'A' + 10;
}

/**
 * Percent-decodes pszEncodedIn into a caller-supplied buffer.
 *
 * "%XY" with two hexadecimal digits becomes the byte it encodes. Any other
 * character is copied unchanged, including '+' and a '%' that does not start
 * a valid escape. An escape that decodes to NUL ("%00") produces no output.
 *
 * @param pszDecodedOut Output buffer. It is zero-filled first, so the result
 *                      is always NUL-terminated.
 * @param nBufferSize   Size of pszDecodedOut in bytes, terminator included.
 * @param pszEncodedIn  NUL-terminated encoded string; NULL is treated as "".
 *
 * If the buffer is too small, an assertion fires in debug builds; when
 * assertions are compiled out, the output is truncated to fit instead.
 */
void urldecode(char *pszDecodedOut, size_t nBufferSize, const char *pszEncodedIn)
{
    size_t nOut = 0;
    const char *pszIn;

    if (pszDecodedOut == NULL || nBufferSize == 0)
        return;

    /* Zero-filling up front provides the terminator wherever decoding stops. */
    memset(pszDecodedOut, 0, nBufferSize);

    if (pszEncodedIn == NULL)
        return;

    pszIn = pszEncodedIn;
    while (*pszIn != '\0')
    {
        char chDecoded = *pszIn;
        size_t nConsumed = 1;

        /* isxdigit('\0') is false, so the short-circuit evaluation never
         * reads past the terminator. */
        if (*pszIn == '%' &&
            isxdigit((unsigned char)pszIn[1]) &&
            isxdigit((unsigned char)pszIn[2]))
        {
            chDecoded = (char)(16 * urldecode_hex_value((unsigned char)pszIn[1]) +
                               urldecode_hex_value((unsigned char)pszIn[2]));
            nConsumed = 3;
        }
        pszIn += nConsumed;

        /* A decoded NUL would cut the C string short; skip it. */
        if (chDecoded == '\0')
            continue;

        /* Room is needed for this character plus the terminator. */
        if (nOut + 1 >= nBufferSize)
        {
            assert(!"urldecode: output buffer too small");
            break;
        }
        pszDecodedOut[nOut++] = chDecoded;
    }
}
Upvotes: 4