import sanitizeHtml from 'sanitize-html';
import Parser from "rss-parser";
import { fetcher } from './data-utils'
import cheerio from "cheerio";
import { getFilenameFromUrl } from './file-utils';

// Single rss-parser instance shared by every feed download in this module (see getFeed).
const parser = new Parser();

// Base URL of the backend that exposes the `/proxy?url=` endpoint used when `useProxy` is set.
// The first non-empty variable wins: API_URL, then REACT_APP_API_URL, then NEXT_PUBLIC_API_URL.
const apiUrl = process.env.API_URL || process.env.REACT_APP_API_URL || process.env.NEXT_PUBLIC_API_URL;

/**
 * Imports a blog: identifies the platform, then collects its posts, their
 * comments and the image assets referenced by the post bodies. Progress is
 * reported through `onUpdate` as `{ message, data }` objects.
 *
 * @param {string} url - Address of the blog to import.
 * @param {string} srcDomain - Domain passed to extractPostAssets to decide which images count as blog assets.
 * @param {function({message: string, data: *}): void} onUpdate - Progress callback.
 * @param {boolean} [useProxy] - Forwarded to the fetch helpers.
 * @param {boolean} [isDebugging=false] - Forwarded to fetchBlogPostsRss.
 * @returns {Promise<{posts: Array, comments: Object, assets: Array, meta: Object}>}
 * @throws {Error} When an argument is missing, or when any import stage fails. Stage
 *   failures carry a stage-specific message, the original error's stack, and the
 *   original error itself as `cause`.
 */
export async function fetchBlog(url, srcDomain, onUpdate, useProxy, isDebugging = false) {
    if (!url) throw new Error('url missing');
    if (!srcDomain) throw new Error('srcDomain missing');
    if (typeof (onUpdate) !== 'function') throw new Error('onUpdate missing');

    // Runs one import stage and re-labels any failure with the stage and the blog url.
    // The original stack is kept on the new error, as before.
    const runStage = async (failureLabel, stage) => {
        try {
            return await stage();
        } catch (e) {
            const rethrow = new Error(failureLabel + url);
            rethrow.cause = e;
            rethrow.stack = e.stack;
            throw rethrow;
        }
    };

    // Some feeds put several tags into one entry, e.g.
    // "INSPIRATION, POETRY. REBIRTH, MIND, SELF-LOVE, UNCATEGORIZED",
    // so every string entry is split on commas. Non-string entries are passed
    // through untouched instead of crashing on `.split`.
    const flattenCategories = (categories) => {
        const entries = Array.isArray(categories) ? categories : (categories ? [categories] : []);
        return entries.flatMap(entry => (
            typeof entry === 'string'
                ? entry.split(',').map(part => part.trim()).filter(Boolean)
                : [entry]
        ));
    };

    onUpdate({ message: 'blog:import:begin', data: url });

    const { websiteMetadata, websiteType } = await runStage('error fetching website metadata for: ', async () => {
        const metadata = await getWebsiteMetadata(url, onUpdate, useProxy);
        return { websiteMetadata: metadata, websiteType: metadata.websiteType };
    });

    const posts = await runStage('error fetching posts for: ', async () => {
        const fetchedPosts = await fetchBlogPostsRss(url, websiteType, onUpdate, useProxy, isDebugging);
        for (const post of fetchedPosts) {
            post.categories = flattenCategories(post.categories);
        }
        return fetchedPosts;
    });

    const comments = await runStage('error fetching comments for: ', () => (
        fetchBlogCommentsRss(posts, websiteType, onUpdate, useProxy)
    ));

    const assets = await runStage('error extracting post assets for: ', async () => {
        // TODO - convert this to a standardized schema.
        // e.g. WordPress has `content:encoded`, others have just `content`, etc.
        const allHtml = posts
            .map(np => np['content:encoded'] || np['content'])
            .join('\n\n');

        // extractPostAssets rejects empty html; a blog without any post content
        // simply has no assets and must not fail the whole import.
        const extractedAssets = allHtml.trim() ? await extractPostAssets(allHtml, srcDomain) : [];

        const foundAssets = [];
        for (const asset of extractedAssets) {
            foundAssets.push(asset);
            onUpdate({ message: 'blog:asset:found', data: asset });
        }
        return foundAssets;
    });

    const data = { posts, comments, assets, meta: websiteMetadata };

    onUpdate({ message: 'blog:import:complete', data });

    return data;
}

/**
 * Downloads the page at `url` and guesses which blogging platform produced it.
 *
 * Detection is a plain text scan of the response body: any occurrence of
 * "wp-admin" means WordPress; otherwise more than ten occurrences of "wix"
 * means Wix; otherwise the type is left undefined.
 *
 * History: this request used to fail with "Response for preflight is invalid
 * (redirect)"; whether the proxied url ends with a slash turned out to matter, see
 * https://stackoverflow.com/questions/42168773/how-to-resolve-preflight-is-invalid-redirect-or-redirect-is-not-allowed-for
 *
 * @param {string} url - Page to inspect.
 * @param {function({message: string, data: *}): void} onUpdate - Progress callback.
 * @param {boolean} [useProxy] - Forwarded to getWebsiteResponse.
 * @returns {Promise<{websiteType: (string|undefined), finalUrl: string}>}
 * @throws {Error} When `url` is missing or `onUpdate` is not a function.
 */
async function getWebsiteMetadata(url, onUpdate, useProxy) {
    if (!url) {
        throw new Error('url missing');
    }
    if (typeof onUpdate !== 'function') {
        throw new Error('onUpdate missing');
    }

    const response = await getWebsiteResponse(url, useProxy);
    const body = await response.text();
    const finalUrl = response.url;

    onUpdate({ message: 'blog:connected:success.', data: url });

    // Number of hits for a pattern, or undefined when there are none.
    const hitCount = (pattern) => body.match(pattern)?.length;

    let websiteType;
    if (hitCount(/wp-admin/g)) {
        websiteType = 'wordpress';
    } else if (hitCount(/wix/g) > 10) {
        websiteType = 'wix';
    }

    onUpdate({ message: 'blog:type:identified', data: websiteType });

    return { websiteType, finalUrl };
}

/**
 * Fetches `url`, either directly or through the backend proxy endpoint.
 *
 * When proxying, the target becomes `${apiUrl}/proxy?url=${url}/` (note the
 * appended trailing slash).
 * NOTE(review): `url` is interpolated as-is, without URL-encoding; a query
 * string inside `url` is presumably seen by the proxy as separate parameters -
 * confirm against the proxy implementation.
 *
 * @param {string} url - Address to fetch.
 * @param {boolean} [useProxy] - Route the request through the proxy when truthy.
 * @returns {Promise<*>} Whatever `fetcher` resolves to when called with `{ getJson: false }`.
 */
async function getWebsiteResponse(url, useProxy) {
    const target = useProxy ? `${apiUrl}/proxy?url=${url}/` : url;

    return fetcher(target, undefined, { getJson: false });
}

/**
 * Collects the post entries of a blog from its RSS feed.
 *
 * - 'wordpress': walks `<url>/feed?paged=N` starting at N = 0 until a page
 *   comes back without items. A failure whose stack mentions "404" on any page
 *   after the first is treated as "no more pages"; every other failure is
 *   rethrown. With `isDebugging` only the first page is read.
 *   NOTE(review): paging starts at 0 - confirm the site does not serve the same
 *   entries for paged=0 and paged=1.
 * - 'wix': reads `<url>/blog-feed.xml` and replaces each entry's categories
 *   with the tags returned by getWixBlogPostTags for `<url>/post/<guid>`. The
 *   post link is built from `url` so tags come from the site being imported.
 * - any other type: yields an empty list.
 *
 * Each entry is announced through `onUpdate` as 'blog:post:found'.
 *
 * History: requests used to fail with "Response for preflight is invalid
 * (redirect)"; whether the proxied url ends with a slash turned out to matter, see
 * https://stackoverflow.com/questions/42168773/how-to-resolve-preflight-is-invalid-redirect-or-redirect-is-not-allowed-for
 *
 * @param {string} url - Blog address.
 * @param {string} websiteType - 'wordpress' or 'wix'.
 * @param {function({message: string, data: *}): void} onUpdate - Progress callback.
 * @param {boolean} [useProxy] - Route feed requests through `${apiUrl}/proxy`.
 * @param {boolean} [isDebugging] - Stop after the first WordPress page.
 * @returns {Promise<Array>} The collected feed entries.
 * @throws {Error} On missing arguments or when a feed cannot be read.
 */
export async function fetchBlogPostsRss(url, websiteType, onUpdate, useProxy, isDebugging) {
    if (!url) throw new Error('url missing');
    if (!websiteType) throw new Error('websiteType missing');
    if (typeof (onUpdate) !== 'function') throw new Error('onUpdate missing');

    const feedBase = useProxy ? `${apiUrl}/proxy?url=${url}/` : url;

    // https://stackoverflow.com/questions/24381480/remove-duplicate-forward-slashes-from-the-url
    const squashSlashes = (address) => address.replace(/([^:]\/)\/+/g, "$1");

    // Builds the error reported for an unreadable feed address, keeping the original stack.
    const describeFailure = (address, cause) => {
        const failure = new Error('error getting blog for post page: ' + address);
        failure.stack = cause.stack;
        return failure;
    };

    const collected = [];
    const announce = (entry) => {
        onUpdate({ message: 'blog:post:found', data: entry });
        collected.push(entry);
    };

    if (websiteType == 'wordpress') {
        for (let pageNo = 0; ; pageNo++) {
            if (isDebugging && pageNo > 0) {
                break;
            }

            const pageUrl = squashSlashes(`${feedBase}/feed?paged=${pageNo}`);

            try {
                const pageFeed = await getFeed(pageUrl);

                if (!pageFeed?.items?.length) {
                    break;
                }

                pageFeed.items.forEach(entry => announce(entry));
            } catch (err) {
                const failure = describeFailure(pageUrl, err);

                // The upstream status code is not available (the feed parser
                // swallows it), so the stack text is the only hint of a 404.
                const looksLikeMissingPage = /404/.test(err.stack);

                // A 404 after the first page just means the pages ran out.
                if (looksLikeMissingPage && pageNo !== 0) {
                    break;
                }

                throw failure;
            }
        }
    } else if (websiteType == 'wix') {
        const wixFeedUrl = squashSlashes(`${feedBase}/blog-feed.xml`);

        try {
            const wixFeed = await getFeed(wixFeedUrl);
            const entries = wixFeed?.items?.length ? wixFeed.items : [];

            // Sequential on purpose: awaiting inside .forEach() would not wait.
            // https://stackoverflow.com/questions/34365952/async-await-not-waiting
            for (const entry of entries) {
                const postLink = `${url}/post/${entry.guid}`;
                entry.categories = await getWixBlogPostTags(postLink, useProxy);
                announce(entry);
            }
        } catch (err) {
            throw describeFailure(wixFeedUrl, err);
        }
    }

    return collected;
}

/**
 * Collects the comments of every post from the per-post comment feeds.
 *
 * Only 'wordpress' is supported: for each post the feed at `post.link + "feed"`
 * is read (this assumes `post.link` ends with a slash). Posts without comments
 * are skipped and get no key in the result. Any other website type yields an
 * empty object.
 *
 * Each comment is announced through `onUpdate` as 'blog:comment:found'.
 *
 * @param {Array<{guid: string, link: string}>} posts - Posts returned by fetchBlogPostsRss.
 * @param {string} websiteType - 'wordpress' or 'wix'.
 * @param {function({message: string, data: *}): void} onUpdate - Progress callback.
 * @param {boolean} [useProxy] - Route feed requests through `${apiUrl}/proxy`.
 * @returns {Promise<Object<string, Array>>} Comment entries keyed by `post.guid`.
 * @throws {Error} On missing arguments or when a comment feed cannot be read.
 */
export async function fetchBlogCommentsRss(posts, websiteType, onUpdate, useProxy) {
    if (!posts) throw new Error('posts missing');
    if (!websiteType) throw new Error('websiteType missing');
    if (typeof (onUpdate) !== 'function') throw new Error('onUpdate missing');

    const returnFeed = {};

    // TODO: find a way to import wix comments.
    if (websiteType !== 'wordpress') {
        return returnFeed;
    }

    // TODO: might need to follow the paging loop used for posts; comment feeds
    // look capped. This post only ever returned 10 and ?paged=2 did not help:
    // https://rmnewman.wordpress.com/2021/04/27/2021-a-progressive-poem/
    // https://wordpress.stackexchange.com/questions/4736/get-all-posts-in-rss
    for (const post of posts) {
        // `post.guid + "&feed=rss2"` was tried first, but the proxy does not
        // follow the 301 WordPress answers with, so the redirect target is
        // built by hand. Not sure this is safe for all WordPress blogs.
        const commentsFeed = post.link + "feed";

        // No trailing slash after the proxied url - WordPress does not like it:
        // https://www.wildinkmarketing.com/?p=1779&feed=rss2   <-- ok
        // https://www.wildinkmarketing.com/?p=1779&feed=rss2/  <-- NO
        const fetchUrl = useProxy ? `${apiUrl}/proxy?url=${commentsFeed}` : commentsFeed;

        // https://stackoverflow.com/questions/24381480/remove-duplicate-forward-slashes-from-the-url
        const wordpressCommentFeedUrl = fetchUrl.replace(/([^:]\/)\/+/g, "$1");

        try {
            const feed = await getFeed(wordpressCommentFeedUrl);

            if (!feed?.items?.length) {
                // This post has no comments; later posts may still have some,
                // so move on instead of aborting the whole loop.
                continue;
            }

            returnFeed[post.guid] = [];

            for (const item of feed.items) {
                onUpdate({ message: 'blog:comment:found', data: item });
                returnFeed[post.guid].push(item);
            }
        } catch (err) {
            const rethrow = new Error('error getting comments for post page: ' + wordpressCommentFeedUrl);
            rethrow.stack = err.stack;
            throw rethrow;
        }
    }

    return returnFeed;
}

/**
 * Downloads and parses the RSS feed at `feedUrl` with the shared parser.
 *
 * History: parser.parseURL once gave inconsistent results (10 entries instead
 * of the 14 a browser or cURL showed) when the proxy was addressed through a
 * `www.` host name; dropping the `www.` fixed it. Fetching the text with
 * `fetcher` and handing it to parser.parseString was tried as an alternative.
 *
 * @param {string} feedUrl - Address of the feed.
 * @returns {Promise<*>} The parsed feed as produced by parser.parseURL.
 * @throws {Error} When `feedUrl` is missing, or when parsing fails (the error
 *   then carries the original failure's stack).
 */
async function getFeed(feedUrl) {
    if (!feedUrl) {
        throw new Error('feedUrl missing');
    }

    let parsedFeed;

    try {
        parsedFeed = await parser.parseURL(feedUrl);
    } catch (cause) {
        const wrapped = new Error('error getting feed for page: ' + feedUrl);
        wrapped.stack = cause.stack;
        throw wrapped;
    }

    return parsedFeed;
}

/**
 * Cleans the html of a post: sanitizes it, removes tab characters, collapses
 * runs of line breaks into a single "\n" and strips elements left without content.
 *
 * @param {string} html - Raw post html.
 * @returns {string} The cleaned html.
 */
export function extractPostContent(html) {
    // https://github.com/apostrophecms/sanitize-html
    // `allowedTags: false` switches tag filtering off. Attribute filtering is
    // left at the library defaults on purpose: `style` was allowed for a while
    // (some posts use background-image url() instead of <img>), but our editor
    // does not support background images well, see
    // https://app.asana.com/0/1117768121049462/1200488050643570.
    //
    // sanitize-html's exclusiveFilter is not used for dropping empty elements;
    // its author suggests cheerio for that, which stripEmptyHtml does:
    // https://github.com/apostrophecms/sanitize-html/issues/58
    // https://github.com/apostrophecms/sanitize-html/pull/349
    // https://github.com/apostrophecms/sanitize-html/issues/313
    // https://github.com/apostrophecms/sanitize-html/pull/356
    const sanitizedHtml = sanitizeHtml(html, {
        allowedTags: false,
    });

    const noTabs = sanitizedHtml.replace(/\t/g, '');

    // Collapse two or more consecutive CR/LF characters into one newline.
    // (The pattern must use \r and \n, not \\r and \\n - the latter only
    // match a literal backslash followed by a letter.)
    // https://stackoverflow.com/questions/6360566/replace-multiple-newlines-tabs-and-spaces
    const removeMultiNewline = noTabs.replace(/[\r\n]{2,}/g, '\n');

    return stripEmptyHtml(removeMultiNewline);
}

// https://stackoverflow.com/questions/6092855/how-do-i-remove-empty-p-tags-with-jquery/6092882
/**
 * Removes elements that contain nothing but whitespace / `&nbsp;`, except the
 * tags sanitize-html lists as self-closing.
 *
 * Elements are visited last-to-first so that nested elements are handled before
 * the elements containing them: a wrapper whose only children were empty ends
 * up empty itself and is removed too.
 *
 * @param {string} html - Html fragment to clean.
 * @returns {string} The remaining markup, trimmed.
 */
function stripEmptyHtml(html) {
    // Third argument `false` loads the input as a fragment:
    // https://github.com/cheeriojs/cheerio/issues/1031#issuecomment-748677236
    const $ = cheerio.load(html, undefined, false);

    const hasNoContent = (markup) => markup.replace(/\s|&nbsp;/g, '').length == 0;

    const elements = $('*').get();

    for (let position = elements.length - 1; position >= 0; position--) {
        const node = elements[position];
        const $node = $(node);

        if (!hasNoContent($node.html())) {
            continue;
        }

        if (sanitizeHtml.defaults.selfClosing.includes(node.name)) {
            continue;
        }

        $node.remove();
    }

    return $.html().trim();
}

/**
 * Lists the `src` of every `<img>` in `html` that is hosted on the blog's own
 * domain or on the WordPress / Wix media hosts, e.g.
 *   https://rmnewman.files.wordpress.com/2021/04/kidlitosphere-progressive-poem-2021.png
 *   https://static.wixstatic.com/media/109702687f5c4c38986333623d19b652.jpg/v1/fill/w_1480,h_988,al_c,q_90/109702687f5c4c38986333623d19b652.webp
 *
 * Matching is case-insensitive and looks for the host anywhere in the `src`.
 * Images without a `src` attribute are ignored.
 *
 * @param {string} html - Markup to scan.
 * @param {string} srcDomain - Domain of the blog being imported; matched literally.
 * @returns {string[]} Matching image urls, in document order.
 * @throws {Error} When `html` or `srcDomain` is missing.
 */
export function extractPostAssets(html, srcDomain) {
    if (!html) throw new Error('html missing');
    if (!srcDomain) throw new Error('srcDomain missing');

    // Third argument `false` loads the input as a fragment:
    // https://github.com/cheeriojs/cheerio/issues/1031#issuecomment-748677236
    const $ = cheerio.load(html, undefined, false);

    // Escape every regex metacharacter so the domain is matched literally
    // ("my.blog.com" must not match "myxblog.com", and odd input cannot break
    // the pattern).
    // https://stackoverflow.com/questions/17885855/use-dynamic-variable-string-as-regex-pattern-in-javascript
    const escapedSrcDomain = srcDomain.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

    // Backslashes are doubled because this is a string handed to RegExp, not a
    // regex literal; a single "\." would reach RegExp as a bare ".".
    const srcDomainRegex = new RegExp(
        `(${escapedSrcDomain}|files\\.wordpress\\.com|static\\.wixstatic\\.com)`,
        'i',
    );

    return $('img')
        .toArray()
        .map(el => el.attribs['src'])
        .filter(src => typeof src === 'string' && srcDomainRegex.test(src));
}

/**
 * Downloads a single asset and returns its body as a blob.
 * https://stackoverflow.com/questions/37686213/upload-image-from-url-to-firebase-storage
 *
 * @param {string} assetUrl - Address of the asset.
 * @returns {Promise<*>} The result of calling `.blob()` on the fetcher response.
 * @throws {Error} When `assetUrl` is missing.
 */
export async function fetchPostAsset(assetUrl) {
    if (!assetUrl) {
        throw new Error('assetUrl missing');
    }

    const response = await fetcher(assetUrl, undefined, { rawResponse: true });

    return response.blob();
}

/**
 * Reads the tags of a Wix blog post from its rendered page: the text of every
 * `<li>` inside the element labelled `aria-label=tags`.
 *
 * @param {string} wixBlogPostUrl - Address of the post page.
 * @param {boolean} [useProxy] - Forwarded to getWebsiteResponse.
 * @returns {Promise<string[]>} Tag texts in page order (empty when none are found).
 * @throws {Error} When the page cannot be fetched or parsed; the error names the
 *   post url and carries the original failure's stack.
 */
async function getWixBlogPostTags(wixBlogPostUrl, useProxy) {
    try {
        const res = await getWebsiteResponse(wixBlogPostUrl, useProxy);
        const html = await res.text();

        const $ = cheerio.load(html);

        return $('[aria-label=tags]').find('li').toArray().map(el => $(el).text());
    } catch (e) {
        // Must use this function's own parameter here: there is no `url` in
        // scope, and referencing it would replace the real failure with a
        // ReferenceError.
        const rethrow = new Error('error fetching wix post tags for: ' + wixBlogPostUrl);
        rethrow.stack = e.stack;
        throw rethrow;
    }
}
