<?xml version="1.0" encoding="utf-8"?>
<?xml-model href="rfc7991bis.rnc"?>
<!DOCTYPE rfc [
        <!ENTITY nbsp "&#160;">
        <!ENTITY zwsp "&#8203;">
        <!ENTITY nbhy "&#8209;">
        <!ENTITY wj "&#8288;">
        ]>
<rfc xmlns:xi="http://www.w3.org/2001/XInclude"
     category="info"
     docName="draft-improving-data-quality-tags-00"
     ipr="trust200902"
     obsoletes=""
     consensus="false"
     submissionType="IETF"
     xml:lang="en"
     version="3">

    <front>
        <title>Improving Data Quality through Special Text Tags</title>
        <author fullname="Aleksey Ovcharenko" initials="A." surname="Ovcharenko">
            <address>
                <email>aleksey.ovcharenko@gmail.com</email>
            </address>
        </author>
        <date year="2023"/>
        <area>General</area>
        <workgroup>Internet Engineering Task Force</workgroup>

        <keyword>data quality</keyword>
        <keyword>conversational AI</keyword>
        <keyword>text tagging</keyword>

        <abstract>
            <t>
                This document proposes the use of special text tags to enhance data quality and improve the
                understanding of user queries in conversational AI models. By incorporating these tags, models can
                benefit from additional context and structure during training and inference, leading to more accurate
                and relevant responses.
            </t>
        </abstract>
    </front>

    <middle>
        <section>
            <name>Introduction</name>
            <t>
                Conversational AI models often face challenges in data collection and text parsing, impacting their
                performance and reliability. This proposal aims to address these challenges by introducing special text
                tags. This approach draws inspiration from related works in natural language processing, information
                retrieval, and conversational AI.
            </t>
        </section>

        <section>
            <name>Motivation</name>
            <t>
                The motivation behind this proposal is to improve the quality of training data and enhance the
                understanding of user queries by incorporating special text tags. The idea is influenced by research on
                intent recognition, entity extraction, and context modeling in natural language understanding. Notable
                works include:
            </t>
            <ul>
                <li>
                    Previous studies on intent recognition in dialogue systems have
                    explored the use of intent tags to improve the accuracy of responses
                    <xref target="intent-recognition"/>.
                </li>
                <li>
                    Named Entity Recognition (NER) techniques have been widely studied
                    and applied in information extraction tasks. These approaches inspire the entity tagging component
                    proposed in this study <xref target="gibbs-sampling"/>.
                </li>
                <li>
                    Research on dialogue modeling has emphasized the importance
                    of context and sequential information in generating coherent responses. Contextual tags introduced
                    in this proposal draw inspiration from these studies
                    <xref target="contextual-understanding"/>.
                </li>
            </ul>
        </section>

        <section>
            <name>Specification</name>
            <section anchor="intent-tagging">
                <name>Intent Tagging</name>
                <t>
                    Intent tags are used to label the intent or purpose of user queries, providing guidance to the model
                    in generating more contextually appropriate responses.
                </t>
                <ul>
                    <li>
                        [intent-def]: For queries seeking definitions of terms.
                    </li>
                    <li>
                        [intent-comp]: For queries comparing two or more entities.
                    </li>
                    <li>
                        [intent-ex]: For queries requesting examples or instances.
                    </li>
                    <li>
                        [intent-steps]: For queries seeking step-by-step instructions.
                    </li>
                    <li>
                        [intent-adv-disadv]: For queries exploring the pros and cons of a topic.
                    </li>
                </ul>
            </section>
            <section anchor="entity-tagging">
                <name>Entity Tagging</name>
                <t>
                    Entity tags are used to identify and label specific entities within the text, improving the model's
                    understanding of user queries related to those entities.
                </t>
                <ul>
                    <li>
                        [entity-person]: For queries related to people or individuals.
                    </li>
                    <li>
                        [entity-organization]: For queries related to organizations or companies.
                    </li>
                    <li>
                        [entity-location]: For queries related to specific locations.
                    </li>
                    <li>
                        [entity-date]: For queries related to dates or time.
                    </li>
                    <li>
                        [entity-product]: For queries related to products or items.
                    </li>
                </ul>
            </section>
            <section anchor="contextual-tags">
                <name>Contextual Tags</name>
                <t>
                    Contextual tags mark contextual information, providing cues for maintaining a coherent and
                    context-aware conversation.
                </t>
                <ul>
                    <li>
                        [context-background]: For providing background information or context.
                    </li>
                    <li>
                        [context-constraints]: For indicating limitations or constraints.
                    </li>
                    <li>
                        [context-previous-query]: For referring to a previous user query or conversation
                        context.
                    </li>
                    <li>
                        [context-next-steps]: For suggesting the next steps in a process or task.
                    </li>
                    <li>
                        [context-clarification]: For seeking clarification or additional details.
                    </li>
                </ul>
            </section>
            <section anchor="quality-assessment-tags">
                <name>Quality Assessment Tags</name>
                <t>
                    Quality assessment tags help identify the quality or reliability of information, enabling the model
                    to generate more cautious and reliable responses.
                </t>
                <ul>
                    <li>
                        [qa-biased]: Indicating biased information.
                    </li>
                    <li>
                        [qa-unverified]: Denoting information that is not verified or lacks credibility.
                    </li>
                    <li>
                        [qa-misleading]: Highlighting information that may be misleading or deceptive.
                    </li>
                    <li>
                        [qa-outdated]: Identifying information that is outdated or no longer accurate.
                    </li>
                    <li>
                        [qa-fact-check-needed]: Flagging information that requires fact-checking.
                    </li>
                </ul>
            </section>
            <section anchor="emotion-tone-markers">
                <name>Emotion or Tone Markers</name>
                <t>
                    Emotion or tone markers indicate the emotional or tonal aspects of the text, enabling the model to
                    generate more appropriate and empathetic responses.
                </t>
                <ul>
                    <li>
                        [tone-positive]: Denoting a positive emotional tone.
                    </li>
                    <li>
                        [tone-negative]: Indicating a negative emotional tone.
                    </li>
                    <li>
                        [tone-neutral]: Denoting a neutral or unbiased tone.
                    </li>
                    <li>
                        [tone-joy]: Indicating a joyful or happy emotion.
                    </li>
                    <li>
                        [tone-sadness]: Denoting a sad or sorrowful emotion.
                    </li>
                </ul>
            </section>
        </section>

        <section anchor="IANA">
            <name>IANA Considerations</name>
            <t>This memo includes no request to IANA.</t>
        </section>

        <section anchor="security-considerations">
            <name>Security Considerations</name>
            <t>
                Implementing special text tags does not, in itself, introduce inherent security risks. However,
                secure and privacy-conscious practices need to be ensured during the tagging process and data
                collection, adhering to existing guidelines <xref target="usage-policies"/>.
            </t>
        </section>

        <section anchor="interoperability">
            <name>Interoperability</name>
            <t>
                Interoperability is crucial for the widespread adoption of special text tags. Standardization efforts
                are important to ensure consistent usage and interpretation of tags across different conversational
                AI models and platforms. Collaboration with standardization bodies is encouraged, building on
                existing efforts in the field <xref target="caml-dialogue-systems"/>.
            </t>
        </section>

        <section anchor="implementation-deployment">
            <name>Implementation and Deployment</name>
            <t>
                This section discusses the practical aspects of integrating special text tags. Suggested steps
                include involving human annotators or domain experts to accurately tag training data, modifying
                training processes to consider the tags, and updating inference systems to interpret and respond to
                tagged user queries effectively.
            </t>
        </section>

        <section anchor="conclusion">
            <name>Conclusion</name>
            <t>
                The proposed special text tags offer a structured approach to enrich the training data of conversational
                AI models. By incorporating these tags, models can improve data quality, enhance understanding of user
                queries, and generate more accurate and contextually relevant responses. Further research and
                experimentation are encouraged to evaluate these potential benefits.
            </t>
        </section>
    </middle>


    <back>
        <references>
            <name>Informative References</name>
            <reference anchor="intent-recognition" target="https://www.cs.cornell.edu/~kilian/papers/msdadomain.pdf">
                <front>
                    <title>Marginalized Denoising Autoencoders for Domain Adaptation</title>
                    <author initials="M." surname="Chen">
                        <organization/>
                    </author>
                    <author initials="Z." surname="Xu">
                        <organization/>
                    </author>
                    <author initials="K." surname="Weinberger">
                        <organization/>
                    </author>
                    <author initials="O." surname="Chapelle">
                        <organization/>
                    </author>
                    <date year="2012"/>
                </front>
            </reference>
            <reference anchor="gibbs-sampling" target="https://www.aclweb.org/anthology/P/P05/P05-1045.pdf">
                <front>
                    <title>Incorporating Non-local Information into Information Extraction Systems by Gibbs Sampling</title>
                    <author initials="J. R." surname="Finkel">
                        <organization/>
                    </author>
                    <author initials="T." surname="Grenager">
                        <organization/>
                    </author>
                    <author initials="C." surname="Manning">
                        <organization/>
                    </author>
                    <date year="2005"/>
                </front>
            </reference>
            <reference anchor="contextual-understanding" target="https://www.aclweb.org/anthology/D/D11/D11-1145.pdf">
                <front>
                    <title>Data-driven Response Generation in Social Media</title>
                    <author initials="A." surname="Ritter">
                        <organization/>
                    </author>
                    <author initials="C." surname="Cherry">
                        <organization/>
                    </author>
                    <author initials="B." surname="Dolan">
                        <organization/>
                    </author>
                    <date year="2011"/>
                </front>
            </reference>
            <reference anchor="usage-policies" target="https://openai.com/policies/usage-policies">
                <front>
                    <title>Usage policies</title>
                    <author>
                        <organization>OpenAI</organization>
                    </author>
                    <date year="2021"/>
                </front>
            </reference>
            <reference anchor="caml-dialogue-systems" target="https://citeseerx.ist.psu.edu/doc/10.1.1.1086.4050">
                <front>
                    <title>CAML - A Universal Configuration Language for Dialogue Systems</title>
                    <author initials="G." surname="Kovasznai">
                        <organization/>
                    </author>
                    <author initials="C." surname="Kotropoulos">
                        <organization/>
                    </author>
                    <author initials="I." surname="Pitas">
                        <organization/>
                    </author>
                </front>
            </reference>
        </references>
    </back>
</rfc>
